teisort.pl

#!/usr/bin/perl -w

# V1.1 6/2002 by Guido Ostkamp, <Guido.Ostkamp@t-online.de>
#  * more dictd-like sorting with by_dict_sort()
#
# V1.0 4/2002 by Michael Bunk, <kleinerwurm@gmx.net>
#  * does some sorting of a tei-file, but without parser (worked out on xml,
#    but might be fine with sgml - i didn't try it)
#  * problem: xml-parser would not tell position, would it?
#  * so let's do without parser ;-)
#  * using some undefined collation order
#  * i wrote this to be able to merge double entrys
#  * sort with an in-memory index of
#    - keyword (first orth)
#    - byte-start-offset of entry in tei file (end is found by
#      outputting until </entry>)
#  * we could try DB_File (see manpage), but let's try simple hash first
#     key: <orth>-characters
#     value: struct <entry>-offset

# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, visit
# <http://www.gnu.org/licenses/gpl-2.0.txt>

use strict;

sub by_dict_sort {
 my $x = $a;
 my $y = $b;

 $x =~ s/[^\d\w\s]//gi; # remove all non-alphanumeric and non-whitespace
 $x =~ s/(.*)/\L$1/gi;  # turn lowercase

 $y =~ s/[^\d\w\s]//gi; # remove all non-alphanumeric and non-whitespace
 $y =~ s/(.*)/\L$1/gi;  # turn lowercase

 $x cmp $y
 }

my $file = shift;

unless(defined $file) {
 print STDERR "\nteisort - sort tei file without using any xml parser (not safe)\n";
 print STDERR "\n The inputfile is expected in XML or SGML TEI format, see http://www.tei-c.org/\n";
 print STDERR " Output is on stdout.\n\n";
 print STDERR " Usage: teisort <teifile>\n";
 print STDERR " <teifile> : name of tei inputfile\n\n";
 die
 }

die "Can't find file \"$file\"" unless -r $file;

open HANDLE, "<".$file;

print STDERR "Generating index in memory...\n";
my ($headend, $footstart, $offset, $entry, %orths, $orth, $counter,
 $tell_now, $tell_lastline);

# when we find "<entry" then we keep on reading until "</entry>",
# but save everything inside a $entry (we could keep the whole text
# in the hash!!! [this is just a notice for future reference
# for myself]). then we look for the "<orth*>$1</orth>" and use
# $1 as key
my $todo = "";
my $offsetOnLine = 0;

my $searchmode = 0;
# 0 = find end of header by finding "<entry"
# 1 = find beginning of entry by looking for "<entry"
#     or find end by looking for "</body>"
# 2 = find end of entry by looking for "</entry>"
# 3 = nothing more to find, we are inside the footer, break;
# input is taken from $todo !

readfile: while (<HANDLE>) {
 $tell_lastline = $tell_now;
 $tell_now = tell;
 $todo .= $_;

 while ($todo ne "") {

  # find end of header
  if (($searchmode == 0) && ($todo =~ /<entry/i)) { #i for case insensitivity
   $searchmode = 1;

   my $eoffset = index "<entry", $todo;
   $headend = $tell_lastline + $eoffset; print STDERR "headoffset: $headend\n";

   $offsetOnLine -= $eoffset;
   $todo = substr $todo, 0, $eoffset;
   next
   }

  # find beginning of entry
  if (($searchmode == 1) && ($todo =~ /<entry/i)) {
   $searchmode = 2;

   my $eoffset = index "<entry", $todo;
   $offset = $tell_lastline + $eoffset;

   $entry = substr $todo, $eoffset;

   $counter++; if ($counter % 100 == 0) { print STDERR " $counter entries\n" }

   $offsetOnLine -= $eoffset;
   $todo = substr $todo, 0, $eoffset;
   next
  }

  # find footer
  if (($searchmode == 1) && ($todo =~ /<\/body>/i)) {
   $searchmode = 3;

   my $eoffset = index("<\/body>", $todo);
   $footstart = $tell_lastline + $eoffset; print STDERR "footoffset: $footstart\n";
   #$offsetOnLine -= $eoffset;
   $todo = "";#substr($todo,0,$eoffset);
   last readfile #exit that loop
  }

  # find end of entry
  if (($searchmode == 2) && ($todo =~ /<\/entry>/i)) {
   $searchmode = 1;

   my $eoffset = index "</entry>", $todo;
   $entry .= substr($todo, 0, 8+$eoffset) . "\n";

   $offsetOnLine -= $eoffset;
   $todo = substr $todo, 0, $eoffset;

   # find orth
   # /s modifies to treat $entry as single line
   if ($entry =~ /<orth.*>(.*)<\/orth>/s) {
    $orth = $1;
    #print STDERR "orth: '$orth'\n";
    }
   else {
    #print STDERR ".";
    warn "no orth found in entry!!! there is something wrong! Entry is <$entry>"
    }

   # we may not overwrite any pair in the hash that we already have
   # but since the entry-elements are read from the tei file again,
   # the " *" is never seen in the output :)
   while(defined $orths{$orth}) { $orth .= " " }

   # save in hash
   $orths{$orth}=$offset;

   next
   }

  # else
  $entry .= $todo;
  $todo = "";
  $offsetOnLine = 0
  }
 }

print STDERR " $counter entries\n";

###############################################################

print STDERR "Outputting sorted entries...\n";

# output header
my $header;
die unless sysseek HANDLE, 0, 0;
sysread HANDLE, $header, $headend;
print $header;

$counter = 0;

# this one simple sort call does the keywork!
foreach $orth (sort by_dict_sort keys %orths) {

 $counter++; if ($counter % 100 == 0) { print STDERR " $counter entries\n" }

 $offset = $orths{$orth};
 #print STDERR "offset: $offset\n";

 sysseek HANDLE, $offset, 0;

 # output until </entry>
 my $stopword = "</entry>";
 my $stopwordpos = 0;
 my $stopwordlength = length $stopword;
 my $c;
 do {
  sysread HANDLE, $c,1;# maybe sysread with more than one byte would be faster...
  print $c;
  if ($c eq substr($stopword, $stopwordpos, 1)) { $stopwordpos++ }
  else { $stopwordpos = 0 }
  }
 until $stopwordpos == $stopwordlength;

 } # foreach

# output footer
my $footer;
die unless sysseek HANDLE, $footstart, 0;
my @stats = stat HANDLE;# fetch tei filesize
die unless @stats;
sysread(HANDLE, $footer, $stats[7]-$footstart);
print $footer;

print STDERR " $counter entries\n";
close HANDLE