#!/usr/bin/perl

use warnings;
use strict;

use utf8;
use Cwd qw(getcwd);
use Encode;
use File::Basename qw(dirname);
use File::Spec;

# Exactly one argument is required: the mapping configuration file.
if (@ARGV != 1) {
  die "Usage: $0 <config>\n";
}

# Mapping table filled by readconfig() and consulted by processtoken().
my %mappings = ();
my $cfg = $ARGV[0];
readconfig($cfg);

# Absolute directory that contains this script; the mystem binary is run
# from this directory.  Resolved in-process instead of shelling out to
# `pwd` and `dirname $0`, which broke when the script path contained
# spaces or shell metacharacters.
my $pwd = getcwd();
my $path = File::Spec->rel2abs(dirname($0), $pwd);

# XML prolog of the generated document.
print <<EOF;
<?xml version="1.0" encoding="utf-8"?>
EOF

# Main loop: read STDIN line by line and print one <segment> for every
# <text>...</text> span found on a line.  Each whitespace-delimited token of
# the span is sent through the external mystem pipeline; the result is
# rendered by processtoken(), or by a bare fallback <token> when the
# pipeline returns nothing but whitespace.
while (my $line = <STDIN>) {
  chomp $line;
  # Decode to characters so that the token offsets count characters, not bytes.
  $line = decode('UTF-8',$line);
  while ($line =~ m/<text>(.*?)<\/text>/g) {
    my $text = $1;
    print "<segment>\n";
    while ($text =~ m/(\S+)/g) {
      my $tok = $1;
      # Start offset and end offset (exclusive) of the token inside $text,
      # taken directly from the match that produced it.
      my $cfrom = $-[1];
      my $cto = $+[1];

      # The token is untrusted input.  It is handed to the shell through the
      # environment and referenced by variable name, never interpolated into
      # the command text, so quotes, '$', backticks or backslashes in the
      # input cannot alter the command.  The binary path is passed the same
      # way so that a script directory containing spaces still works.
      local $ENV{MYSTEM_TOKEN} = encode('UTF-8', $tok);
      local $ENV{MYSTEM_BIN} = "$path/mystem";
      my $res = `printf '%s\\n' "\$MYSTEM_TOKEN" | iconv -f utf8 -t windows-1251 | "\$MYSTEM_BIN" -ni | iconv -f windows-1251 -t utf8`;

      if ($res =~ m/^\s*$/) {
        # No analysis came back: emit the token with itself as its stem.
        my $attr = encode('UTF-8', $tok);
        # Keep the attribute value well-formed.  Ampersands that already
        # start an entity are left alone (the input presumably is XML text
        # that may contain entities -- TODO confirm); other ampersands, '<'
        # and '"' are escaped.
        $attr =~ s/&(?!(?:[A-Za-z][A-Za-z0-9]*|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
        $attr =~ s/</&lt;/g;
        $attr =~ s/"/&quot;/g;
        print "  <token form=\"$attr\" from=\"$cfrom\" to=\"$cto\">\n";
        print "    <analysis stem=\"$attr\">\n";
        print "    </analysis>\n";
        print "  </token>\n";
      } else {
        processtoken($res,$cfrom,$cto);
      }
    }
    print "</segment>\n";
  }
}


# Load the mapping configuration file into the file-level %mappings hash.
#
#   $cfg - path of the configuration file
#
# Lines that are empty, whitespace-only, or whose first non-blank character
# is ';' are skipped.  Every other line must have the form
#     KEY := VALUE .
# where KEY and VALUE contain no whitespace; it is stored as
# $mappings{KEY} = VALUE.  The file is read as raw bytes (no decoding layer).
#
# Dies if the file cannot be opened or if a line has any other shape.
sub readconfig {
  my ($cfg) = @_;
  # Three-argument open: the name is always treated as a plain file to read,
  # so a value such as "cmd |" or ">file" given on the command line cannot
  # run a command or clobber a file.
  open (my $inf, '<', $cfg) or die "Cannot open mapping configure file $cfg: $!\n";
  while (my $line = <$inf>) {
    chomp $line;
    # Skip comments and blank lines.
    next if $line =~ m/^\s*;/ || $line =~ m/^\s*$/;
    if ($line =~ m/^\s*(\S+)\s*:=\s*(\S+)\s*\.\s*$/) {
      $mappings{$1}=$2;
    } else {
      die "Invalid configuration line: $line\n";
    }
  }
  close $inf;
}

# Print the <token> element for one chunk of analyser output.
#
#   $tokres - analyser output, expected to begin with FORM{A1|A2|...}
#   $cfrom  - value written to the token's "from" attribute
#   $cto    - value written to the token's "to" attribute
#
# Each analysis between the braces is handled by shape:
#   STEM=LEXINFO=MORPHINFO  prints an <analysis> with one <rule> for every
#                           comma-separated MORPHINFO tag that has a
#                           %mappings entry under the key "POS+TAG", POS
#                           being the part of LEXINFO before its first comma;
#   STEM??                  prints an empty <analysis>;
#   anything else           is skipped.
# Nothing is printed when $tokres has no FORM{...} part.  Dies when LEXINFO
# does not start with a non-comma character.
sub processtoken {
  my ($tokres,$cfrom,$cto) = @_;

  return unless $tokres =~ m/^([^{]+)\{(.*)\}/;
  my ($form,$analysesstr) = ($1,$2);

  print "  <token form=\"$form\" from=\"$cfrom\" to=\"$cto\">\n";

  foreach my $analysis (split(/\|/,$analysesstr)) {
    if ($analysis =~ m/^(.*)=(.*)=(.*)$/) {
      my ($stem,$lexinfo,$morphinfo) = ($1,$2,$3);

      # Drop one trailing "?" from the stem, if present.
      $stem = $1 if $stem =~ /^(.*)\?$/;

      # Leading field of lexinfo; used as the prefix of the mapping key.
      my ($pos) = $lexinfo =~ m/^([^,]+)/
        or die "Invalid lexinfo $lexinfo\n";

      print "    <analysis stem=\"$stem\">\n";
      foreach my $mtag (split(/,/,$morphinfo)) {
        my $key = $pos."+".$mtag;
        next unless exists $mappings{$key};
        print "      <rule id=\"".$mappings{$key}."\" form=\"$form\"/>\n";
      }
      print "    </analysis>\n";
    } elsif ($analysis =~ m/^(.*)\?\?$/) {
      print "    <analysis stem=\"$1\">\n";
      print "    </analysis>\n";
    }
  }

  print "  </token>\n";
}
