import re
import subprocess
from urllib.request import urlopen

def makeBigrams(file):
    """Count word bigrams in a text file, returned as a flat dict
    mapping 'first second' -> count."""
    store = dict()
    with open(file) as f:
        # Split on non-word characters (and underscores), dropping empty strings.
        words = [w for w in re.split(r'[\W_]', f.read()) if w != '']
        for first, second in zip(words, words[1:]):
            bi = first + ' ' + second
            store[bi] = store.get(bi, 0) + 1
    return store
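

# Illustrative sketch (not part of the original script): one way to inspect the
# flat counts returned by makeBigrams, assuming its 'first second' -> count
# layout. The name topBigrams is hypothetical.
def topBigrams(store, n=10):
    """Return the n most frequent bigrams as (bigram, count) pairs."""
    return sorted(store.items(), key=lambda kv: kv[1], reverse=True)[:n]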



def makeBigrams2(file):
    """Count word bigrams in a text file, returned as a nested dict
    mapping first word -> {second word: count}."""
    store = dict()
    with open(file) as f:
        words = [w for w in re.split(r'[\W_]', f.read()) if w != '']
        for first, second in zip(words, words[1:]):
            if first not in store:
                store[first] = {second: 1}
            else:
                store[first][second] = store[first].get(second, 0) + 1
    return store
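

# Illustrative sketch (not part of the original script): the nested layout from
# makeBigrams2 makes per-word successor lookups cheap. The name mostLikelyNext
# is hypothetical; it assumes the first-word -> {second-word: count} structure above.
def mostLikelyNext(store, word):
    """Return the most frequent word following `word`, or None if unseen."""
    successors = store.get(word)
    if not successors:
        return None
    return max(successors, key=successors.get)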

def makeBigrams3(file):
    """Count word bigrams with a shell pipeline (tr/paste/sort/uniq);
    write the counted pairs to a new file and return its name."""
    out = "bigrams_" + file
    # Lowercase the text.
    subprocess.call("tr 'A-Z' 'a-z' < " + file + " > tmp", shell=True)
    # Squeeze every run of non-letters into a single newline: one word per line.
    subprocess.call(r"tr -sc 'a-zA-Z' '\n' < tmp > tmp1", shell=True)
    # tmp2 is tmp1 shifted up by one line, so each word pairs with its successor.
    subprocess.call("tail -n +2 tmp1 > tmp2", shell=True)
    # Pair the columns, sort the pairs, and count duplicates.
    subprocess.call("paste tmp1 tmp2 | sort | uniq -c > " + out, shell=True)
    subprocess.call("rm tmp tmp1 tmp2", shell=True)
    return out
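

# Illustrative sketch (not part of the original script): read the file written by
# makeBigrams3 back into the flat 'first second' -> count dict used elsewhere.
# It assumes the `uniq -c` layout (a count, then the tab-separated pair from paste);
# the name loadShellBigrams is hypothetical.
def loadShellBigrams(path):
    store = dict()
    with open(path) as f:
        for line in f:
            parts = line.split()
            if len(parts) == 3:  # count, first word, second word
                count, first, second = parts
                store[first + ' ' + second] = int(count)
    return store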

pg_uris = [
    "http://www.gutenberg.org/files/20795/20795.txt",
    "http://www.gutenberg.org/dirs/etext97/hpaot10.txt",
    "http://www.gutenberg.org/files/12/12.txt",
    "http://www.gutenberg.org/dirs/etext05/7brdz10.txt",
    "http://www.gutenberg.org/dirs/etext04/nthtt10.txt"
]

def crawlGutenbergCorpus(output="bigcorpus", urls=pg_uris):
    """Download the Project Gutenberg texts and append their body text
    (between the START and END markers) to one corpus file."""
    for uri in urls:
        with urlopen(uri) as inp:
            print("Reading " + uri)
            with open(output, 'a') as out:
                content = False
                for l in inp:
                    # urlopen yields bytes; decode instead of taking the repr via str().
                    line = l.decode('utf-8', errors='replace').rstrip('\r\n')
                    # The marker wording varies ("THE"/"THIS"), so match loosely.
                    if "*** END OF" in line and "PROJECT GUTENBERG EBOOK" in line:
                        break
                    elif content:
                        out.write(line + "\n")
                    elif "*** START OF" in line and "PROJECT GUTENBERG EBOOK" in line:
                        content = True


if __name__ == "__main__":
    crawlGutenbergCorpus()       
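    # Illustrative follow-up (not part of the original script): once the corpus
    # has been downloaded, either in-memory counter can be run over it, e.g.
    #   counts = makeBigrams("bigcorpus")
    #   print(topBigrams(counts, 20))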
