#!/usr/local/bin/perl # Copyright © 2001 by James F. Carter # Permission is granted for any use provided this copyright notice # remains intact. Specifically, if you want to hack this for another language, # feel free. # Issues: # % Suppress 1-letter "words". # . Unsuppress letterals. # . Provide for incoming and outgoing xrefs. # . Multi-word queries # . Order the hits by file position. # % What is "modal~test" doing as a key? (from English) # % What is 2.4.5,4.6.3 doing as a key? # % Only kill keys from stoppable areas. (Leave g-vo). # % Killing seems not to actually kill anything. # CGI script to search the xankua database. # Put this in your HTML document (suitably formatted) (note the field names): #
# # # Regions to search: # Gua\spi word # Words xreffing this one # Definition # English keywords # Thesaurus category # All fields # Show cross references # # #
# The "referer" field is for the URL of the original referrer, i.e. the form
# that called this script, in case this script calls itself multiple times.
# The program attempts to include this referrer document at the end of the
# output, provided a file of the same basename exists in the same directory as
# the script.
# The database is sought in the same directory where the script is.
# If a command line argument of -R is given, the database is rebuilt.
# Filenames:
#   xankua.dat        Text table
#   xankua.{dir,pag}  DBM version of database
# As presently programmed, the file has 1756 lines (including thesaurus
# categories), 12452 keys, 21528 targets.
# Command line arguments (must be in this order):
#   -f basename       Basename of database (for debug)
#   -R                Rebuild database

$basename = "xankua";
if ($ARGV[0] eq "-f") {
    $basename = $ARGV[1];
    splice(@ARGV, 0, 2);
}
$rebuild = ($ARGV[0] eq "-R");

# Compute the name of the database.  Under CGI the script's own path comes
# from the server environment; from the command line it comes from $0.
$cgi = ($ENV{GATEWAY_INTERFACE} ne '');
$script = $cgi ? $ENV{SCRIPT_FILENAME} : $0 =~ /\// ? $0 : "./$0";
$dir = $script;
substr($dir, rindex($dir, '/')) = '';
$dbname = "$dir/$basename";

# Redirect stderr to stdout so error messages reach the browser.
open(STDERR, ">&STDOUT")
    or die "Can't redirect STDERR to STDOUT: $!\n" if $cgi;

# Compute name of the HTML file.  If it is readable, the DATA handle is
# re-opened on it; otherwise DATA continues to refer to this script's own
# __DATA__ section -- which is why the bareword DATA handle and 2-arg open
# must be kept here.
$htmlfile = $script;
substr($htmlfile, rindex($htmlfile, '.')) = ".html";
open(DATA, $htmlfile) || die "Failed to open HTML file $htmlfile: $!\n"
    if -r $htmlfile && !$rebuild;

# Print standard header area:  copy the HTML file up to the separator line.
# NOTE(review): $sep looks truncated -- markup appears to have been stripped
# from this file at some point; confirm the separator against the original.
if (!$rebuild) {
    print "Content-type: text/html\n\n";
    $sep = "\n";
    while (<DATA>) {
        print;
        last if $_ eq $sep;
    }
}

# Open the database text file (read by seek position in &doquery).
open(DB, "$dir/$basename.dat") or die "Can't open $dir/$basename.dat: $!\n";

use SDBM_File;          # Do it here so error messages are printed
use Fcntl;
$DB = "$dir/$basename";
@DB = ("$DB.dir", "$DB.pag");
unlink @DB if $rebuild;
# tie(%DB, 'GDBM_File', $DB,
#     ($rebuild ? &GDBM_WRCREAT : &GDBM_READER), 0644)
tie(%DB, 'SDBM_File', $DB,
    ($rebuild ? (O_RDWR | O_CREAT) : O_RDONLY), 0644)
    or die "Can't open database $DB: $!\n";

# Timing data (on 500 MHz Pentium III): GDBM, ODBM, NDBM appear
# to be identically implemented.  SDBM is slightly different.
# Data about          GDBM     SDBM
# Words               1756     1756   (Base file records)
# Keys               12147    12147   (DB records)
# Targets            20531    20531   (Seek ptrs in DB recs)
# Base file size    187520   187520   bytes
# DBM file size     991232   863232   bytes
# Time to rebuild     3.53     3.01   secs
# Time             1.72e-4  1.46e-4   sec/target

# Time to rebuild the database?  (&rebuild exits the program.)
&rebuild() if $rebuild;

# Extract the query parameters into hash %q.
# HTTP encoding changes spaces to + signs; change it back.
# No need to translate other special codes.
foreach $unit (split(/\&/, $ENV{QUERY_STRING})) {
    ($k, $v) = split("=", $unit);
    $q{$k} = &decodevalue($v);
}
$q{referer} = $ENV{HTTP_REFERER} unless $q{referer} ne '';

# Which areas are to be searched?  %area maps the one-letter area code to
# the record field it indexes; %areax gives the human-readable name.
BEGIN {
    %area  = qw(g word t thes e engl d definition x xref);
    %areax = (g => "-Gua\\spi", x => "Cross references",
              d => "Definition", e => "English keywords",
              t => "Thesaurus category", a => "All fields");
}
foreach $e (qw(g x d e t a)) {
    next if $q{$e} eq '';
    push(@area, $e);
    push(@areax, $areax{$e});
}
$areax = join(', ', @areax);
unless (@area) {
    # NOTE(review): surrounding markup appears stripped from this message.
    print "\n\nNo area was selected; using word and definition.\n";
    @area = qw(g d);
    $areax = "(Missing)";
}

# Print the query as received.
# NOTE(review): the table markup of this report appears stripped; the line
# breaks below are reconstructed -- confirm against the original form layout.
print <<EOF;

Query:
Word or Category $q{word}
Area to Search $areax
Max Results $q{max}
EOF

unless ($q{max} > 0) {
    print "\n\nMaximum results of `$q{max}' must be > 0; using 10.\n";
    $q{max} = 10;
}
unless ($q{word} ne '') {
    print "\n\nSearch word is null.  No results.\n";
    &finish();
}

# Regexp to recognize the sought words: the normalized words joined by '|'.
$regexp = quotemeta(join(' ', &normalize($q{word})));
$regexp =~ tr/ /|/;

# Do the queries.  The subrt knows to skip duplicates.
QUERY:
foreach $w (split(/[,\s]+/, $q{word})) {
    foreach $e (@area) {
        foreach $posn (split(' ', $DB{"$e$w"})) {
            &doquery($posn) or last QUERY;
        }
    }
}

# If cross references were requested, do that now:  collect every xref of
# every hit so far, then look each one up as a -gua\spi word.
if ($q{xref} ne '') {
    my %xref;
    while (($posn, $rec) = each(%hits)) {
        foreach $k (split(/[,\s]+/, $rec->{xref})) {
            $xref{$k}++;
        }
    }
XREF:
    foreach $w (keys %xref) {
        foreach $posn (split(' ', $DB{"g$w"})) {
            &doquery($posn) or last XREF;
        }
    }
}

# Guts of the query process.  Argument: a seek position into the .dat file.
# Reads and unpacks the record (once; repeats just bump its score) into
# %hits, bold-facing the matched words.  Returns 1, or 0 on seek failure.
sub doquery {
    my($posn) = @_;
    if (!exists($hits{$posn})) {
        seek(DB, $posn, 0) or do {
            print STDERR "\n\nFailed to seek to `$posn': $!\n";
            return 0;
        };
        $_ = readline(DB);      # Read the record for that word.
        # Matching words in bold face.  (Opening <B> restored; it was
        # stripped along with the rest of the markup in this file.)
        s/\b($regexp)\b/<B>$1<\/B>/igo if $regexp ne '';
        $hits{$posn} = &munpack($_);
    }
    $hits{$posn}{score}++;
    1;
}

# Organize the results, most hits first, then ordered by file
# position, except thesaurus categories first.
@hits = sort {
    ($hits{$b}{class} eq 't') <=> ($hits{$a}{class} eq 't')
        || $hits{$b}{score} <=> $hits{$a}{score}
        || $a <=> $b
} keys %hits;
printf "\n\n%d matching records found.\n", scalar(@hits);
if (@hits > $q{max}) {
    print "Showing the first $q{max}.\n";
    splice(@hits, $q{max});
}

# Provide the column headers (if there are any results).  The header row is
# faked as a record at pseudo-position -1, prepended to the hit list.
if (@hits > 0) {
    unshift(@hits, -1);
    $hits{"-1"} = &munpack(
        "Word\tCl\tRank\tCategory\tEnglish\tPhonetic\t \t \t \t\t\t\n");
}

# Report the results.
# NOTE(review): opening table markup appears stripped from this print.
print '', "\n";
foreach $posn (@hits) {
    my $rec = $hits{$posn};
    # Extract the "foreign" language translations into one parenthesized,
    # comma-separated annotation (skipping empty or '-' entries).
    my $xl;
    foreach $t (qw(chinese latin loglan)) {
        $v = $rec->{$t};
        $xl .= ucfirst($t) . " $v, " if $v ne '' && $v ne '-';
    }
    $xl = ' (' . substr($xl, 0, -2) . ')' if $xl ne '';
    $xl =~ s/(?<=\w) +(?=[,)])//g;
    # NOTE(review): row markup appears stripped from this heredoc.
    print <<EOF;

${$rec}{word} ${$rec}{thes} (${$rec}{class}) ${$rec}{engl} $xl
EOF
    print "\n${$rec}{definition}\n" if ${$rec}{definition} ne '';
    print "\nNote:${$rec}{comments}\n" if ${$rec}{comments} ne '';
    print "\nSee also:${$rec}{xref}\n" if ${$rec}{xref} ne '';
}
# NOTE(review): $emsg is never assigned anywhere in this file -- suspected
# leftover from an earlier revision; confirm before removing.
print "\n\n$emsg";
&finish();
# End of main thread.

# Appends the rest of the form to the output (patching the referer field's
# value so the form can call this script again), and exits.
sub finish {
    while (<DATA>) {
        s/value=\".*\"/value=\"$q{referer}\"/ if /name=referer/;
        print;
    }
    exit 0;
}

# HTML-encodes values that may contain control characters:  each control
# character except ^J (\n) is rendered as caret + printable letter.
sub xformvalue {
    my($value) = @_;
    my($unit, $c);
    # Translate all control characters except ^J (\n)
    my(@value) = split(/([\0-\011\013-\037])/, $value);
    $value = '';
    while (($unit, $c) = splice(@value, 0, 2)) {
        $value .= $unit;
        if ($c ne '') {
            $value .= '^';
            $value .= chr(ord($c)+0100);    # e.g. ^A for \001
        }
    }
    return $value;
}

# Inverts the HTTP-encoding process:  '+' back to space, %XX back to the
# character with that hex code.
sub decodevalue {
    my($val) = @_;
    $val =~ tr/+/ /;            # Spaces are changed to + signs.
    my @val = split('%', $val);
    my @v2 = shift @val;        # Leading segment has no %XX prefix.
    foreach $_ (@val) {
        # BUGFIX: a stray 'v' after this statement was a syntax error.
        substr($_, 0, 2) = chr(oct("0x" . substr($_, 0, 2)));
    }
    join('', @v2, @val);
}

# Unpack a database record.  Arg: $data (one tab-separated line).
# Returns a hash ref keyed by @fields.
BEGIN {
    @fields = qw(word class rank thes engl phon chinese latin loglan
                 xref definition comments);
    # word        -gua\spi word
    # class
    #   P = Phrase relative pronoun (modal pronouns are S's), Letteral
    #   S = Pure structure word (caselink, digit, etc.)
    #   p = A noun in English
    #   q = Relational, but not a verb in English
    #   r = Relation
    #   t = Category header
    # rank        (not used)
    # thes        Thesaurus category
    # engl        Comma separated English keywords
    # phon        Phonetic English (for making words)
    # chinese     Phonetic Chinese (for making words)
    # latin       Phonetic Latin (for making words)
    # loglan      Old Loglan word
    # xref        Cross reference to other -gua\spi words
    # definition
    # comments
}
sub munpack {
    my($data) = @_;
    chomp $data;
    my $rec = { };
    @{$rec}{@fields} = split("\t", $data);
    $rec;
}

# Splits the argument into words after doing these transformations:
# Arguments starting with X are removed.  Everything becomes lower case.
# Punctuation (e.g. tone symbols in Chinese) is removed.
# Splits the argument into a list of index words.  Transformations:
# arguments starting with X are removed (but not the letteral X itself),
# everything becomes lower case, commas and tildes become separators, and
# (outside thesaurus categories) punctuation, bare numbers and 1-byte
# words are dropped.
# $area: one-letter area code.  't' (thesaurus category) keeps
# punctuation; 'd' (definition) keeps 1-byte words.
# NOTE(review): the old header comment described a "$bits" parameter that
# no longer exists; documentation updated to match the actual $area arg.
sub normalize {
    my($data, $area) = @_;
    $data =~ s/X\S+//g;         # Lose arguments (but not letteral X)
    $data = lc($data);          # Everything becomes lower case
    $data =~ tr/,~/ /;          # Change comma and tilde into word separators
    unless ($area eq 't') {
        $data =~ s/[^\w\s]//g;  # Lose other punctuation
        $data =~ s/\b\d+\b//g;  # Lose trashed thesaurus categories
        # BUGFIX: /g added so ALL 1-byte words are lost, not just the
        # first one (the comment always said "words", plural, and the
        # header issue list asks to suppress 1-letter words).
        $data =~ s/\b\w\b//g unless $area eq 'd';
    }
    split(' ', $data);          # Split, losing all whitespace
}

# Rebuilds the database:  indexes every record of the .dat file (already
# open on DB) into the tied DBM hash %DB, one space-separated list of seek
# positions per "<area-letter><key>" entry.  Exits the program when done.
# (guaspi, cognates, definition, category, all)
sub rebuild {
    my($k, $t, $j);
    # Stop list: words too common to be useful index keys.
    my %stop = qw(is 1 a 1 the 1 and 1 by 1 to 1 of 1 in 1 for 1 case 1
                  at 1 that 1 as 1 about 1 do 1 but 1 as 1 no 1 can 1
                  it 1 has 1 are 1 eg 1 its 1);
    my %stopk = qw(d 1 c 1 a 1);    # Stops apply to these areas
    my $nkeys = 0;                  # Number of keys
    my $ntgts = 0;                  # Number of targets
    my $dots  = 50;                 # Print a dot every N lines
    my $lines = 0;                  # Number of words indexed
    select STDERR;
    print STDERR "Rebuilding database: \n";
    $| = 1;                         # Unbuffer the progress dots
    my($posn) = tell DB;
    # NOTE(review): <DB> restored below -- the filehandle read was stripped
    # along with the markup in this file (grounded by the continue block's
    # "tell DB").
    while (<DB>) {
        if (++$lines % $dots == 0) { print STDERR '.'; }
        next if substr($_,0,1) eq '!';      # '!' lines are comments
        chomp;
        my $rec = &munpack($_);
        my %keys;                   # key => { area-letter => count }
        my @areas;
        if (${$rec}{class} eq 't') {
            # For a thesaurus category, hit on just the category, its
            # first 2 components, and the title {member engl}.
            next unless ${$rec}{thes} =~ /^\d+\.\d+\.\d+$/;
            my $t = ${$rec}{thes};
            $t =~ s/\.\d+$//;
            $keys{$t}{t}++;
            push(@areas, 't');
        } else {
            # For any other, hit on the word, the thesaurus category,
            # the English keywords, definition, and cross reference.
            @areas = qw(g t e d x);
        }
        foreach $e (@areas) {
            foreach $k (&normalize(${$rec}{$area{$e}}, $e)) {
                next if $stopk{$e} && $stop{$k};
                $keys{$k}{$e}++;
            }
        }
        # Get the keys for the "all" category.  Skip any gotten so far.
        foreach $k (&normalize($_)) {
            $keys{$k}{a}++ unless $stop{$k} || exists($keys{$k});
        }
        # Add the seek pointer for this record to each key.
        my(@added);
        while (($k, $t) = each(%keys)) {
            foreach $j (keys %$t) {
                my $orig = $DB{"$j$k"};
                next if $stopk{$j} && $stop{$k};
                # The limit of 400 bytes is enough to accomodate the
                # largest thesaurus categories (Roman letterals).
                if ($stopk{$j} && length($orig) > 400) {
                    # Key has grown too common: add it to the stop list
                    # and remove its entries from every stoppable area.
                    $stop{$k}++;
                    my $e;
                    foreach $e (keys %stopk) {
                        $orig = $DB{"$e$k"};
                        delete $DB{"$e$k"};
                        next if $orig eq '';
                        my $nk = scalar(split(/ /, $orig));
                        $nkeys--;           # This key is removed
                        $ntgts -= $nk;
                    }
                } elsif ($orig eq '') {
                    $DB{"$j$k"} = $posn;
                    $nkeys++;               # This is a new key
                    $ntgts++;
                    push(@added, "$j$k");
                } else {
                    $DB{"$j$k"} .= " $posn";
                    $ntgts++;
                    push(@added, "$j$k");
                }
            }
        }
    } continue {
        $posn = tell DB;        # Seek position of the NEXT record
    }
    print STDERR
        "\nRebuild finished.  $lines words, $nkeys keys, $ntgts targets\n";
    # while (($k, $t) = each(%DB)) {
    #     print join("\t", $k, scalar(split(/ /, $t)), length($t)), "\n";
    # }
    exit 0;
}

__DATA__