#!/bin/sh
############################################################
#        SCRIPT FOR ITALIAN PART OF SPEECH TAGGING
# for Helmut Schmid's TreeTagger with Achim Stein's lexicon
############################################################
# Achim Stein
# Universitaet Stuttgart, Institut fuer Linguistik-Romanistik
# Keplerstrasse 17, D-70174 Stuttgart
# e-mail: achim@ims.uni-stuttgart.de
# October 1997
###########################################################################

# INSTALLATION SETTINGS -- adjust these before first use.

# Directories holding the tagger binary and the parameter file.
BIN=./bin
LIB=./lib

# Options handed to the TreeTagger unless -o replaces them.
TAGGEROPTS='-token -lemma -sgml'

# Location of the tagger executable.
TAGGERCMD="$BIN/tree-tagger"

# Location of the Italian parameter file.
PARAMETERS="$LIB/italian.par"

############################################################
# The script requires gawk (we used V2.15), tr, grep,
# the tree-tagger and the parameter file.
# 
# Input files have to be in ISO-Latin-1. SGML codes (if any)
# should each stand on a line of their own (surrounded by line breaks).
#
# WHAT THIS SCRIPT DOES:
#
# 1. PRE-PROCESSING:
# The tokenization converts italian text into a one-word-per-line
# format which complies with our lexicon entries and with the
# Parameter File(s) we distribute.
#
# 2. TAGGING with the options set in TAGGEROPTS above or given via -o
#
###########################################################################

# Parse the command line.
#   -h        request the usage text
#   -o OPTS   replace the default TreeTagger options with OPTS
# An unknown option, or -o without its argument, is reported by getopts
# itself and returned as "?".  It is treated like -h so that the run stops
# with the usage text instead of silently tagging with settings the user
# did not ask for.
HELP=0

while getopts ho: myopts
do
  case $myopts in
    h) HELP=1 ;;
    o) TAGGEROPTS="$OPTARG" ;;
    \?) HELP=1 ;;
  esac
done
# Drop the parsed options so that only the input argument(s) remain.
shift $((OPTIND - 1))


# Remember the remaining argument(s) as a single string.  With exactly one
# argument "$*" is that argument, with none it is empty, and with several
# they are joined by the first character of IFS.
INPUT="$*"

# Print the usage text when help was requested or when more than one
# argument is left over.  The tagger is then started without arguments
# (presumably so that it lists its own options after the last usage line)
# and the script ends with the status of that tagger call.
if test "$HELP" -gt 0 || test "$#" -gt 1
then
  cat <<EOM
tree-tagger-italian [-o 'TreeTagger options'] [input]
- tokenizes and morphologically analyzes Italian texts
- reads from stdin (unless input is specified), writes to stdout
- requires Helmut Schmid's TreeTagger
- requires Gnu Awk (Tested with gawk version 2.15, patchlevel 4)
- Valid TreeTagger Options are:
EOM
  $TAGGERCMD
  exit
fi

####### Tokenization for Italian texts
# Everything below is one pipeline:
#   cat | gawk (split off punctuation) | tr + grep (one token per line)
#       | gawk (re-join abbreviations and numbers) | tree-tagger
#
# "$@" is handed to cat (rather than an unquoted variable) so that a file
# name containing blanks or glob characters is opened exactly as given.
# With no argument cat reads standard input.  At most one argument can be
# left here, because the usage check exits when there are more.
cat -s -- "$@" |

gawk '
# A line that consists of a single SGML tag: protect its blanks with "~"
# so the tag is not broken up by the later one-token-per-line split, then
# print it unchanged otherwise.
# NOTE(review): the "~" is not converted back anywhere in this script.
/^<.*>$/ { gsub(/ /, "~"); print; next }
# A tag embedded in running text: put a blank after every ">".
/<.*>/ {
   gsub(/>/, "> ")
}
{
# cut punctuation off
  gsub(/\047/, "\047 ")           # blank after every apostrophe (octal 047)
  gsub(/  *%/, "%")               # remove blanks in front of a percent sign
  gsub(/\.\.\./, " ___ ")         # shield an ellipsis from the rules below
  gsub(/"/, " & ")                # isolate double quotes
  gsub(/[\.,;:!\?\)\]]/, " &")    # blank before closing punctuation
  gsub(/[\(\[]/, "& ")            # blank after opening brackets
  gsub(/___/, "...")              # restore the shielded ellipsis
  gsub(/#/, "")                   # delete hash signs
  gsub(/\253/, "<< ")             # octal 253: ISO-Latin-1 opening guillemet
  gsub(/\273/, " >>")             # octal 273: ISO-Latin-1 closing guillemet
  gsub(/---?/, " - ")             # two or three hyphens become one dash token
# Strip leading and trailing spaces
  gsub(/^ */, "")
  gsub(/ *$/, "")
}
{ print }' |

# One word per line: every blank becomes a newline, empty lines are dropped.
tr ' ' '\12' |
grep -v '^$' |

############# Handle exceptions:
# Tokens are written as "\n" followed by the token, so the output starts
# with an empty line and the END rule supplies the final newline.  A token
# that has to be glued to its predecessor is written without the leading
# newline.  p1 always holds the previous input token.  N is never assigned;
# it is only a placeholder argument for the helper functions.
gawk '

# Abbreviations which are in the lexicon/parameter file:
# a lone "." is glued on when the previous token is one of the listed
# abbreviations (optionally preceded by "*") or a dot plus one character.
$0~/^\.$/ && p1~/^\*?(\..|L|Lit|art|lett|no?|pagg?|prot|tel)$/ { append(N) }

# Abbreviations of type "U.e.f.a.": a token starting with "." followed by
# a character other than "." is glued to the previous token.
$0~/^\.[^\.]/ { append(N) }
# decimal numbers: ",digits" after a token that contains a digit
$0~/^,[0-9]+/ && p1~/[0-9]+/ { append(N) }
# append series of numbers (e.g. 300 000)
$0~/^[0-9]+$/ && p1~/^[0-9]+$/ { append(N) }

# append punctuation before punctuation (not a good idea)
# $0~/^\.$/ { 
#   getline f1
#   if(match(f1,/[\.!\?:,;]/) == 1) append2(N)
#     else printf "\n%s\n%s", $0, f1; next
#  }

# print the remaining cases
{printf "\n%s", $0; stack(N)}
END {printf "\n"}

# Glue the current token to the previous output, remember it and go on
# with the next input record.
function append(N) {
  printf"%s", $0
  stack(N); next
}
# Only referenced by the disabled rule above.
function append2(N) {
  printf"_%s\n%s", $0, f1
  p1=$0; $0=f1
  stack(N); next
}
# Remember the current token as the previous one.
function stack(N) {
  p1=$0
}
' |

# Run the tagger on the token stream.  The command and the parameter file
# are quoted so that an installation path containing blanks still works.
# TAGGEROPTS is left unquoted on purpose: it has to split into options.
"$TAGGERCMD" "$PARAMETERS" $TAGGEROPTS