#!/bin/sh

# Set these paths appropriately

BIN=./bin
# NOTE: CMD is not referenced anywhere in this script; kept so the
# configuration section stays unchanged for anyone who sources or edits it.
CMD=./cmd
LIB=./lib

TOKENIZER=${BIN}/separate-punctuation
TAGGER=${BIN}/tree-tagger
ABBR_LIST=${LIB}/english-abbreviations
PARFILE=${LIB}/english.par

# Usage: <this-script> [file ...]
# Concatenates the named input files (or reads standard input when no file
# is given), tokenizes the text, splits English clitics off the preceding
# word, and pipes the resulting tokens, one per line, into the tagger.

# concatenate all input files; "$@" keeps each file name as a single
# argument, so names containing spaces or glob characters are handled
cat "$@" |
# do tokenization
"$TOKENIZER" +1 +s +l "$ABBR_LIST" |
# separate clitics from preceding words
# Every pattern is anchored with '$', so a clitic is only split when it ends
# a line (presumably the tokenizer emits one token per line -- confirm
# against the tokenizer's documentation). A space is inserted in front of the
# clitic; the 'tr' stage below turns that space into a token boundary.
sed -e "s/'s\$/ 's/" \
    -e "s/s'\$/s '/" \
    -e "s/n't\$/ n't/" \
    -e "s/'re\$/ 're/" \
    -e "s/'ve\$/ 've/" \
    -e "s/'d\$/ 'd/" \
    -e "s/'m\$/ 'm/" \
    -e "s/'em\$/ 'em/" \
    -e "s/'ll\$/ 'll/" \
    -e '/^$/d' |
# put every space-separated piece on its own line
tr ' ' '\n' |
# remove empty lines (e.g. produced by consecutive spaces)
grep -v '^$' |
# tagging
"$TAGGER" "$PARFILE" -token -lemma -sgml