#!/usr/bin/ruby
# Command-line wrapper around the TreeTagger tools for German text.
#
# For every file named on the command line, one of the following happens,
# selected by the switch that Command_line reports as set:
#   tok     run only the tokenizer and print its output;
#   pro     tokenize and tag, then print the result as one Prolog term:
#           a list of sentences, each a list of word/3 terms;
#   lemmas  tokenize and tag, print only the third column of each line;
#   words   tokenize and tag, print only the first column of each line;
#   (none)  tokenize and tag, print the tagger output unchanged.

# Help text handed to Command_line; currently empty.
doc = <<"DOC"
DOC

require 'shellwords'
require '~/Ruby/cmd_line.rb'

BIN = '/Applications/tree-tagger-MacOSX-3.1/bin/'
CMD = '/Applications/tree-tagger-MacOSX-3.1/tagger-scripts/cmd/'
LIB = '/Applications/tree-tagger-MacOSX-3.1/tagger-scripts/lib/'
TOKENIZER = BIN + 'separate-punctuation'
TAGGER = BIN + 'tree-tagger'
ABBR_LIST = LIB + 'german-abbreviations'
PARFILE = LIB + 'german.par'
FILTER = CMD + 'filter-german-tags'

# The +s switch seems to activate separation of punctuation.
# I'll bet +l activates abbreviation processing.
# What does +1 do?

# Formats one whitespace-separated tagger line as a Prolog word/3 term
# followed by a comma.
#
# NOTE(review): not called anywhere in this script, and its field order
# (word, tag, lemma) differs from the 'pro' branch (word, lemma, tag) --
# confirm which order the Prolog consumer expects.
def make_prolog(line)
  word, tag, lemma = line.split
  "word('%s', '%s', '%s')," % [word, tag, lemma]
end

# Escapes text for use inside a single-quoted Prolog atom: a quote is
# doubled and a backslash is doubled. nil becomes the empty string.
def prolog_atom_text(text)
  # Block form avoids gsub's special handling of backslashes in
  # replacement strings.
  text.to_s.gsub(/['\\]/) { |ch| ch == "'" ? "''" : '\\\\' }
end

# Shell command that tokenizes +file_name+. The file name is shell-escaped
# so that names containing spaces or metacharacters are passed through intact.
def tokenizer_command(file_name)
  "#{TOKENIZER} +1 +s +l #{ABBR_LIST} #{Shellwords.escape(file_name)}"
end

# Shell pipeline that tokenizes +file_name+ and feeds the result to the tagger.
def tagger_command(file_name)
  "#{tokenizer_command(file_name)} | #{TAGGER} #{PARFILE} -token -lemma"
end

# Builds the Prolog term for a sequence of tagger output lines.
#
# Each non-blank line is split on whitespace into word, tag and lemma and
# becomes word('WORD', 'LEMMA', 'TAG'). A sentence is closed whenever the
# word is '.'; tokens left over after the last '.' form a final sentence
# instead of being dropped.
#
# Returns the complete term as a String ending in '.'.
def prolog_text(tagged_lines)
  sentences = []
  current = []
  tagged_lines.each do |ln|
    word, tag, lemma = ln.split
    next if word.nil? # blank line: nothing to emit

    current << format("word('%s', '%s', '%s')",
                      prolog_atom_text(word),
                      prolog_atom_text(lemma),
                      prolog_atom_text(tag))
    next unless word == '.'

    sentences << ('[' + current.join(",\n ") + ']')
    current = []
  end
  sentences << ('[' + current.join(",\n ") + ']') unless current.empty?
  '[' + sentences.join(",\n ") + '].'
end

long_switches = ['pro', 'tok', 'lemmas', 'words']
needs_switches = []
$cl = Command_line.new('h', doc, long_switches, needs_switches)

$cl.rest.each do |file_name|
  if $cl['tok']
    # String#each no longer exists (removed in Ruby 1.9); each_line yields
    # the same newline-terminated lines.
    `#{tokenizer_command(file_name)}`.each_line { |ln| puts ln }
  elsif $cl['pro']
    # NOTE(review): this trace line goes to stdout ahead of the Prolog term;
    # kept as in the original -- confirm whether consumers rely on it.
    puts "#{TAGGER} #{PARFILE} #{file_name}"
    puts prolog_text(`#{tagger_command(file_name)}`.each_line)
  else
    puts "#{TAGGER} #{PARFILE} #{file_name}"
    `#{tagger_command(file_name)}`.each_line do |ln|
      if $cl['lemmas']
        puts ln.split[2]
      elsif $cl['words']
        puts ln.split[0]
      else
        puts ln
      end
    end
  end
end