#!/usr/local/bin/perl
# Copyright © 2001 by James F. Carter
# Permission is granted for any use provided this copyright notice
# remains intact.  Specifically, if you want to hack this for another language,
# feel free.

# Issues:
#   %	Suppress 1-letter "words".
#   .	Unsuppress letterals.
#   .	Provide for incoming and outgoing xrefs.
#   .	Multi-word queries
#   .	Order the hits by file position.
#   %	What is "modal~test" doing as a key?  (from English)
#   %	What is 2.4.5,4.6.3 doing as a key?
#   %	Only kill keys from stoppable areas.  (Leave g-vo).
#   %	Killing seems not to actually kill anything.

# CGI script to search the xankua database.  
# Put this in your HTML document (suitably formatted) (note the field names):
#   <form action="xankua.cgi" method=get>
#   <label>Word to look up: <input type=text name=word ></label>
#   <label>Max responses: <input type=text name=max value="10"></label>
#   Regions to search:
#	<input type=checkbox name=g value="yes">Gua\spi word
#	<input type=checkbox name=x value="yes">Words xreffing this one
#	<input type=checkbox name=d value="yes">Definition
#	<input type=checkbox name=e value="yes">English keywords
#	<input type=checkbox name=t value="yes">Thesaurus category
#	<input type=checkbox name=a value="yes">All fields
#   <input type=checkbox name=xref value="yes">Show cross references
#   <input type=hidden name=referer value="">
#   <input type=submit name=submit accesskey="s">
#   </form>
# The "referer" field is for the URL of the original referrer, i.e. the form
# that called this script, in case this script calls itself multiple times.
# The program attempts to include this referrer document at the end of the
# output, provided a file of the same basename exists in the same directory as
# the script.

# The database is sought in the same directory where the script is.
# If a command line argument of -R is given, the database is rebuilt.
# Filenames: 
#   xankua.dat		Text table
#   xankua.{dir,pag}	DBM version of database

# As presently programmed, the file has 1756 lines (including thesaurus 
# categories), 12452 keys, 21528 targets.

		# Command line arguments (must be in this order):
		#   -f basename		Basename of database (for debug)
		#   -R			Rebuild database
$basename = "xankua";
if ($ARGV[0] eq "-f") {
    $basename = $ARGV[1];
    splice(@ARGV, 0, 2);	# Consume "-f basename" so -R can follow
}
$rebuild = ($ARGV[0] eq "-R");

		# Compute the name of the database.  Under CGI the server
		# provides SCRIPT_FILENAME; from the command line fall back
		# on $0, forcing a leading "./" so the rindex('/') below is
		# guaranteed to find a slash.
$cgi = ($ENV{GATEWAY_INTERFACE} ne '');
$script = $cgi ? $ENV{SCRIPT_FILENAME} : $0 =~ /\// ? $0 : "./$0";
$dir = $script;
		# Lvalue substr: truncate at the last '/' to leave just the
		# directory part of the script path.
substr($dir, rindex($dir, '/')) = '';
$dbname = "$dir/$basename";

		# Redirect stderr to stdout, so error messages appear in the
		# generated page instead of only in the server's error log.
open(STDERR, ">&STDOUT") or die "Can't redirect STDERR to STDOUT: $!\n" if $cgi;
		# Compute name of the HTML file by replacing the script's
		# extension with ".html".
		# NOTE(review): if the script name has no '.', rindex returns
		# -1 and this clobbers the final character -- assumed not to
		# occur in practice.
$htmlfile = $script;
substr($htmlfile, rindex($htmlfile, '.')) = ".html";
		# Reopen the DATA handle (normally the text after __DATA__)
		# on the companion HTML file, so the reads below and in
		# &finish stream that file's contents.
open(DATA, $htmlfile) || die "Failed to open HTML file $htmlfile: $!\n"
	if -r $htmlfile && !$rebuild;

		# Print standard header area.  Print HTML file up to separator.
if (!$rebuild) {
    print "Content-type: text/html\n\n";
    $sep = "<!-- Results go here -->\n";
    while (<DATA>) {
	print;
	last if $_ eq $sep;
    }
}

		# Open the text database.  The DB handle stays open for the
		# whole run: the search code seeks it to the byte offsets
		# stored in the index, and &rebuild reads it sequentially.
open(DB, "$dir/$basename.dat") or die "Can't open $dir/$basename.dat: $!\n";
use SDBM_File;			#Do it here so error messages are printed
use Fcntl;
$DB = "$dir/$basename";
@DB = ("$DB.dir", "$DB.pag");	# The two files SDBM_File creates
unlink @DB if $rebuild;		# A rebuild starts from an empty index
# tie(%DB, 'GDBM_File', $DB,
#	($rebuild ? &GDBM_WRCREAT : &GDBM_READER), 0644)
tie(%DB, 'SDBM_File', $DB,
	($rebuild ? (O_RDWR | O_CREAT) : O_RDONLY), 0644)
	or die "Can't open database $DB: $!\n";
		# Timing data (on 500 MHz Pentium III): GDBM, ODBM, NDBM appear
		# to be identically implemented.  SDBM is slightly different.
		# Data about		GDBM	SDBM
		#   Words		1756	1756	(Base file records)
		#   Keys		12147	12147	(DB records)
		#   Targets		20531	20531	(Seek ptrs in DB recs)
		#   Base file size	187520	187520	bytes
		#   DBM file size	991232	863232	bytes
		#   Time to rebuild	3.53	3.01	secs
		#   Time		1.72e-4	1.46e-4	sec/target

		# Time to rebuild the database?  (&rebuild never returns.)
&rebuild() if $rebuild;

		# Extract the query parameters into hash %q.
		# HTTP encoding changes spaces to + signs; change it back.
		# No need to translate other special codes.
		# NOTE(review): ($k, $v) keeps only the first two fields, so
		# a value containing an unencoded '=' is truncated there.
foreach $unit (split(/\&/, $ENV{QUERY_STRING})) {
    ($k, $v) = split("=", $unit);
    $q{$k} = &decodevalue($v);
}
		# If the form did not supply a referer, fall back on the HTTP
		# Referer header so &finish can fill in the hidden field.
$q{referer} = $ENV{HTTP_REFERER} unless $q{referer} ne '';

		# Which areas are to be searched?
		# %area maps each one-letter area code to the record field it
		# searches; %areax gives its human-readable description.
BEGIN {
    %area = qw(g word  t thes  e engl  d definition  x xref);
    %areax = (g => "-Gua\\spi", x => "Cross references", d => "Definition",
	e => "English keywords", t => "Thesaurus category", a => "All fields");
}
		# Collect the checkboxes that were ticked, in a fixed order.
foreach $e (qw(g x d e t a)) {
    next if $q{$e} eq '';
    push(@area, $e);
    push(@areax, $areax{$e});
}
$areax = join(', ', @areax);
unless (@area) {
    print "<p>No area was selected; using word and definition.\n";
    @area = qw(g d);
    $areax = "(Missing)";
}

		# Print the query as received, echoing it back in a table.
		# NOTE(review): $q{word} is interpolated into the HTML
		# unescaped -- harmless for the intended vocabulary, but a
		# crafted query string can inject markup.
print <<EOF;
<table>
<tr><td>Query:<td>
<tr><td>Word or Category	<td>$q{word}
<tr><td>Area to Search		<td>$areax
<tr><td>Max Results		<td>$q{max}
</table>
EOF
		# Numeric comparison: a missing or non-numeric max counts as
		# 0 here and falls back to the default of 10.
unless ($q{max} > 0) {
    print "<p>Maximum results of `$q{max}' must be > 0; using 10.\n";
    $q{max} = 10;
}
unless ($q{word} ne '') {
    print "<p>Search word is null.  No results.\n";
    &finish();		# Prints the rest of the page and exits.
}
		# Regexp to recognize the sought words, used by &doquery to
		# bold-face matches in the output.  Each normalized word is
		# quoted individually and the words joined with '|'.
		# (Previously quotemeta was applied to the space-joined
		# list, which escaped the spaces; the following tr/ /|/ then
		# produced "\|" -- a literal bar in a regexp -- so
		# multi-word queries never matched anything.)
$regexp = join('|', map { quotemeta($_) } &normalize($q{word}));

		# Do the queries.  The subrt knows to skip duplicates.
		# The query may be several words separated by commas or
		# whitespace; each word is tried against each selected area.
		# Index values are space-separated seek positions.  &doquery
		# returns 0 only on a seek failure, which aborts the search.
QUERY: foreach $w (split(/[,\s]+/, $q{word})) {
    foreach $e (@area) {
	foreach $posn (split(' ', $DB{"$e$w"})) {
	    &doquery($posn) or last QUERY;
	}
    }
}
		# If cross references were requested, do that now.  Collect
		# every word cross-referenced by the records found so far,
		# then look each one up in the "g" (gua\spi word) area.
if ($q{xref} ne '') {
    my %xref;
    while (($posn, $rec) = each(%hits)) {
	foreach $k (split(/[,\s]+/, $rec->{xref})) {
	    $xref{$k}++;
	}
    }
    XREF: foreach $w (keys %xref) {
	foreach $posn (split(' ', $DB{"g$w"})) {
	    &doquery($posn) or last XREF;
	}
    }
}

# Guts of the query process.  Argument: a seek position (byte offset of a
# record in the .dat file).  Each position is fetched and unpacked at most
# once per run; a repeat hit only bumps the record's score, which later
# drives the result ordering.  Returns 1 on success, 0 if the seek failed.
sub doquery {
    my($posn) = @_;
    if (!exists($hits{$posn})) {
	seek(DB, $posn, 0) or do {
	    print STDERR "<p>Failed to seek to `$posn': $!\n";
	    return 0;
	};
	$_ = readline(DB);		# Read the record for that word.
		# Matching words in bold face.  /o compiles the (constant)
		# global $regexp only once for the whole run.
	s/\b($regexp)\b/<B>$1<\/B>/igo if $regexp ne '';
	$hits{$posn} = &munpack($_);
    }
    $hits{$posn}{score}++;
    1;
}

		# Organize the results, most hits first, then ordered by file 
		# position, except thesaurus categories first.
		# The eq comparisons yield 1/0, so class-'t' records sort
		# ahead; then descending score; then ascending seek position
		# (i.e. file order).
@hits = sort {
	($hits{$b}{class} eq 't') <=> ($hits{$a}{class} eq 't') ||
	$hits{$b}{score} <=> $hits{$a}{score} || 
	$a <=> $b
    } keys %hits;
printf "<p>%d matching records found.\n", scalar(@hits);
if (@hits > $q{max}) {
    print "Showing the first $q{max}.\n";
    splice(@hits, $q{max});		# Keep only the first $q{max} hits
}
		# Provide the column headers (if there are any results) by
		# faking a record at impossible seek position -1, which is
		# placed first so the report loop renders it as a header row.
		# (The bold tags were previously written as "<B>Word<B>" --
		# the closing tag was another opening tag, so the bold never
		# ended; fixed to proper "</B>".)
if (@hits > 0) {
    unshift(@hits, -1);
    $hits{"-1"} = &munpack("<B>Word</B>\tCl\tRank\t<B>Category</B>\t<B>English</B>\tPhonetic\t \t \t \t\t\t\n");
}
		# Report the results as a three-column table.
print '<table width="100%"><col width="10%"><col width="15%"><col width="75%">',
						"\n";
foreach $posn (@hits) {
    my $rec = $hits{$posn};
		    # Extract the "foreign" language translations into a
		    # parenthesized list, e.g. " (Chinese mau, Latin felis)".
		    # Fields holding '-' mean "no translation".
    my $xl;
    foreach $t (qw(chinese latin loglan)) {
	$v = $rec->{$t};
	$xl .= ucfirst($t) . " $v, " if $v ne '' && $v ne '-';
    }
		    # Wrap in parens, dropping the trailing ", ".
    $xl = ' (' . substr($xl, 0, -2) . ')' if $xl ne '';
    $xl =~ s/(?<=\w) +(?=[,)])//g;	# Tidy spaces before ',' and ')'
    print <<EOF;
<tr><td>${$rec}{word}	<td>${$rec}{thes} (${$rec}{class})	<td>${$rec}{engl} $xl
EOF
    print "<tr><td><td colspan=2>${$rec}{definition}\n" if ${$rec}{definition} ne '';
    print "<tr><td><td>Note:<td>${$rec}{comments}\n" if ${$rec}{comments} ne '';
    print "<tr><td><td>See also:<td>${$rec}{xref}\n" if ${$rec}{xref} ne '';
}
		# NOTE(review): $emsg is never assigned anywhere in this
		# file, so this normally appends nothing -- confirm intent.
print "</table>\n$emsg";

&finish();
		# End of main thread.

# Streams the remainder of the companion HTML file (the part after the
# results separator) to the client, substituting the saved referer URL
# into the hidden "referer" input, then terminates the script.
sub finish {
    my $line;
    while (defined($line = <DATA>)) {
	$line =~ s/value=\".*\"/value=\"$q{referer}\"/ if $line =~ /name=referer/;
	print $line;
    }
    exit 0;
}


# Renders values that may contain control characters in caret notation:
# every control character except newline (^J) becomes '^' plus the
# corresponding printable letter, e.g. tab -> "^I".  Argument: the raw
# string.  Returns the transformed copy; newlines pass through unchanged.
sub xformvalue {
    my($raw) = @_;
    my $out = '';
		# The capturing pattern makes split keep each control
		# character as its own one-character piece.
    foreach my $piece (split(/([\0-\011\013-\037])/, $raw)) {
	if ($piece =~ /\A[\0-\011\013-\037]\z/) {
	    $out .= '^' . chr(ord($piece) + 0100);	# ^@ .. ^_ notation
	} else {
	    $out .= $piece;			# Ordinary text, verbatim
	}
    }
    return $out;
}

# Inverts the HTTP (URL) encoding process: '+' becomes a space and each
# "%XX" escape becomes the character with hex code XX.  Argument: one raw
# value from the query string.  Returns the decoded text.
# (Fixed: a stray bareword "v" had been left at the end of the loop body,
# which is a compile error under strict and a junk statement otherwise.)
sub decodevalue {
    my($val) = @_;
    $val =~ tr/+/ /;		# Spaces are changed to + signs.
		# Split at the '%' signs; the leading fragment (before any
		# '%') needs no decoding, so set it aside first.  Every
		# remaining fragment starts with the two hex digits of an
		# escape.
    my @val = split('%', $val);
    my @v2 = shift @val;
    foreach $_ (@val) {
		# Replace the leading hex pair in place ($_ aliases the
		# array element); oct("0x..") parses the hex digits.
	substr($_, 0, 2) = chr(oct("0x" . substr($_, 0, 2)));
    }
    join('', @v2, @val);
}

# Unpack a database record.  Argument: one tab-separated line from the
# .dat file.  Returns a reference to a hash keyed by the field names in
# @fields (set at compile time so &rebuild can use it too).
BEGIN {
    @fields = qw(word class rank thes engl phon chinese latin loglan xref definition comments);
    # word	-gua\spi word
    # class	One-letter word classification:
    #		P = Phrase relative pronoun (modal pronouns are S's), Letteral
    #		S = Pure structure word (caselink, digit, etc.)
    #		p = A noun in English
    #		q = Relational, but not a verb in English
    #		r = Relation
    #		t = Category header
    # rank	(not used)
    # thes	Thesaurus category
    # engl	Comma separated English keywords
    # phon	Phonetic English (for making words)
    # chinese	Phonetic Chinese (for making words)
    # latin	Phonetic Latin (for making words)
    # loglan	Old Loglan word
    # xref	Cross reference to other -gua\spi words
    # definition	Prose definition
    # comments	Free-form notes
}
sub munpack {
    my($line) = @_;
    chomp $line;			# Strip the trailing newline, if any
    my %rec;
    @rec{@fields} = split(/\t/, $line);	# Missing trailing fields stay undef
    return \%rec;
}

# Splits a field's text into lower-cased index words.
# Arguments: the text, and the one-letter area code it came from (may be
# omitted).  Transformations, in order:
#   - Tokens beginning with capital X (argument markers) are removed,
#     before lower-casing, so letteral "x" words survive.
#   - Everything is lower-cased; commas and tildes become separators.
#   - Except for thesaurus categories (area 't'), remaining punctuation
#     and all-digit tokens are dropped, and -- except in definitions
#     (area 'd') -- the first one-character word is dropped.
# Returns the list of words.
sub normalize {
    my($text, $region) = @_;
    $text =~ s/X\S+//g;			# Drop arguments (capital-X prefix)
    $text = lc($text);
    $text =~ tr/,~/  /;			# Commas and tildes separate words
    if ($region ne 't') {
	$text =~ s/[^\w\s]//g;		# Strip punctuation
	$text =~ s/\b\d+\b//g;		# Strip pure-number tokens
	$text =~ s/\b\w\b// if $region ne 'd';	# Drop a one-byte word
    }
    return split(' ', $text);		# Split, losing all whitespace
}

# Rebuilds the DBM index from the text database already open on DB.
# For each record, &normalize extracts index keys for each searchable area
# (guaspi word, category, English keywords, definition, xrefs, plus "all");
# each DBM key "<area-letter><word>" maps to a space-separated list of the
# byte offsets of matching records.  Keys that grow too long in stoppable
# areas are retroactively declared stop words and purged.  Exits when done.
sub rebuild {
    my($k, $t, $j);
    my %stop = qw(is 1 a 1 the 1 and 1 by 1 to 1 of 1 in 1 for 1 case 1 at 1 that 1 as 1 about 1 do 1 but 1 as 1 no 1 can 1 it 1 has 1 are 1 eg 1 its 1 ); #Stop list
		# NOTE(review): "as" appears twice in the list above
		# (harmless -- duplicate hash keys collapse).
    my %stopk = qw(d 1 c 1 a 1);		# Stops apply to these areas
		# NOTE(review): 'c' is not one of the area codes used below
		# (g t e d x a) -- possibly meant to be 'e'; confirm.
    my $nkeys = 0;				# Number of keys
    my $ntgts = 0;				# Number of targets
    my $dots = 50;				# Print a dot every N lines
    my $lines = 0;				# Number of words indexed
    select STDERR;
    print STDERR "Rebuilding database: \n";
    $| = 1;			# Unbuffered, so progress dots appear live
		# $posn holds the byte offset of the record about to be
		# read; the continue block refreshes it after each line.
    my($posn) = tell DB;
    while (<DB>) {
	if (++$lines % $dots == 0) {
	    print STDERR '.';
	}
		# Lines starting with '!' are comments in the .dat file.
	next if substr($_,0,1) eq '!';
	chomp;
	my $rec = &munpack($_);
	my %keys;
	my @areas;
	if (${$rec}{class} eq 't') {
		# For a thesaurus category, hit on just the category, its
		# first 2 components, and the title {member engl}.
	    next unless ${$rec}{thes} =~ /^\d+\.\d+\.\d+$/;
	    my $t = ${$rec}{thes};
	    $t =~ s/\.\d+$//;		# e.g. "3.4.5" also indexed as "3.4"
	    $keys{$t}{t}++;
	    push(@areas, 't');
	} else {
		# For any other, hit on the word, the thesaurus category,
		# the English keywords, definition, and cross reference.
	    @areas = qw(g t e d x);
	}
		# %area (the BEGIN block near the top of the file) maps the
		# area letter to the record field holding its text.
	foreach $e (@areas) {
	    foreach $k (&normalize(${$rec}{$area{$e}}, $e)) {
		next if $stopk{$e} && $stop{$k};
		$keys{$k}{$e}++;
	    }
	}
		# Get the keys for the "all" category.  Skip any gotten so far.
	foreach $k (&normalize($_)) {
	    $keys{$k}{a}++ unless $stop{$k} || exists($keys{$k});
	}
		# Add the seek pointer for this record to each key 
		# NOTE(review): @added is collected but never used.
	my(@added);
	while (($k, $t) = each(%keys)) {
	    foreach $j (keys %$t) {
		my $orig = $DB{"$j$k"};
		next if $stopk{$j} && $stop{$k};
			# The limit of 400 bytes is enough to accommodate the
			# largest thesaurus categories (Roman letterals).
			# A stoppable key grown past it is declared a stop
			# word and purged from every stoppable area.
		if ($stopk{$j} && length($orig) > 400) {
		    $stop{$k}++;
		    my $e;
		    foreach $e (keys %stopk) {
			$orig = $DB{"$e$k"};
			delete $DB{"$e$k"};
			next if $orig eq '';
				# scalar(split ...) yields the field count,
				# i.e. how many targets this key held.
			my $nk = scalar(split(/ /, $orig));
			$nkeys--;			# This key is removed
			$ntgts -= $nk;
		    }
		} elsif ($orig eq '') {
		    $DB{"$j$k"} = $posn;
		    $nkeys++;				# This is a new key
		    $ntgts++;
		    push(@added, "$j$k");
		} else {
		    $DB{"$j$k"} .= " $posn";
		    $ntgts++;
		    push(@added, "$j$k");
		}
	    }
	}
    } continue {
	$posn = tell DB;
    }
    print STDERR "\nRebuild finished.  $lines words, $nkeys keys, $ntgts targets\n";
#   while (($k, $t) = each(%DB)) {
#	print join("\t", $k, scalar(split(/ /, $t)), length($t)), "\n";
#   }
    exit 0;
}
__DATA__