Friday, June 26, 2015

Natural Language Processing with Python: Chapter 6 Exercise Answers

#ex1
#Too descriptive

#ex 2
def gender_features(word):
    """Build a dict of orthographic features for gender classification.

    Features: last one/two characters, lowercased first character and
    first two characters, word length, and whether 'yn' occurs in the
    (case-sensitive) word.
    """
    lowered = word.lower()
    features = {}
    features['suffix1'] = word[-1:]
    features['suffix2'] = word[-2:]
    features['startswith'] = lowered[0]
    features['length'] = len(word)
    features['first2char'] = lowered[:2]
    features['containsyn'] = 'yn' in word
    return features

# Build (name, gender) pairs from the NLTK names corpus and shuffle so that
# male and female examples are interleaved before splitting.
names = ([(name, 'male') for name in names.words('male.txt')] +
         [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)
featuresets = [(gender_features(n), g) for (n, g) in names]

# BUG FIX: the original used train = names[500:], devtest = names[500:1000],
# test = names[1000:], so every evaluation name was ALSO in the training set
# (data leakage inflates the reported accuracy). Use three disjoint slices,
# following the NLTK book's split: test / devtest / train.
test_set = nltk.apply_features(gender_features, names[:500])
devtest_set = nltk.apply_features(gender_features, names[500:1500])
train_set = nltk.apply_features(gender_features, names[1500:])

classifier = nltk.NaiveBayesClassifier.train(train_set)

p(nltk.classify.accuracy(classifier, test_set))  # accuracy on held-out names
p(classifier.show_most_informative_features(5))



Tuesday, June 23, 2015

Natural Language Processing with Python: Chapter 2 Answers


import pprint
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import *
from nltk.corpus import state_union
from matplotlib import pylab

def p(obj):
    """Shorthand used throughout these answers: pretty-print *obj*."""
    pprint.pprint(obj)

#ex 1
sent = "The quick brown fox jumps over the lazy dog"
phrase = sent.split()
phrase.append('extra words')  # addition
# IMPROVED: the original looped `p(phrase.index(w))`, which is O(n) per word
# and reports the FIRST occurrence's index for any duplicated token; enumerate
# yields each position directly (identical output here, since every token is
# unique).
for idx, w in enumerate(phrase):
    p(idx)  # indexing
p(phrase * 2)  # multiplication
p(phrase[2:5])  # slicing
p(sorted(w.lower() for w in phrase))


#ex 2
# Token count and (case-folded) vocabulary size of Austen's "Persuasion".
austen = gutenberg.words('austen-persuasion.txt')
lowered = [w.lower() for w in austen]
p(len(lowered))       # total word tokens
p(len(set(lowered)))  # distinct word types


#ex 3
# Words (after the first 100) from the second and third Brown categories.
cats = brown.categories()
brown_categories = cats  # keep the original module-level name bound
p(brown.words(categories=cats[1:3])[100:])

#ex 4
def tabulate(cfdist, words, categories):
    """Print a count table: one row per category, one column per word.

    cfdist     -- an nltk.ConditionalFreqDist keyed by category
    words      -- sample words shown as columns
    categories -- conditions shown as rows

    NOTE: Python 2 print statements (trailing comma suppresses the newline).
    """
    print '%-16s' % 'Category',
    for word in words:
        print '%6s' % word,
    print
    for category in categories:
        print '%-16s' % category,
        for word in words:
            print '%6d' % cfdist[category][word],
        print

# Count every word of every State of the Union address, conditioned on the
# file id, then tabulate how often 'men', 'women' and 'people' occur per
# address.
cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())

#ex 5
# All six holonym/meronym relations for the synset 'book.n.01',
# printed in the same order as before.
book = wn.synset('book.n.01')
for relation in ('part_holonyms', 'substance_holonyms', 'member_holonyms',
                 'part_meronyms', 'substance_meronyms', 'member_meronyms'):
    p(getattr(book, relation)())

#ex 6
Round-trip (circular) translation can introduce inaccuracies or outright errors: translating a text into another language and then back rarely reproduces the original exactly. Comparing the round-trip results obtained through several intermediate languages can help identify and reduce these distortions.

#ex 7
# Concordance of 'however' in three different corpora.
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
brn = nltk.Text(nltk.corpus.brown.words('ca01'))
cht = nltk.Text(state_union.words(state_union.fileids()[0]))
for text in (emma, brn, cht):
    # the last text contains a questionable use of 'however'
    p(text.concordance('however'))


#ex 8
# Frequency of each name's initial letter, conditioned on corpus file
# (male.txt vs female.txt), shown as a plot.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()


#ex 9
#?

#ex 10
#?

#ex 11
#?

#ex 12
cmd  = nltk.corpus.cmudict.dict()
p(len(set(map(lambda x: x.lower(),cmd)))) #123455
ctr = 0
for k in cmd.keys():
    if(len(cmd[k]) > 1):
        ctr = ctr+1
p(ctr) # 9241

#ex 13
# Fraction of noun synsets that have no hyponyms (leaf nouns in WordNet).
als = list(wn.all_synsets('n'))
leaf_count = sum(1 for s in als if not s.hyponyms())
# BUG FIX: under Python 2 (this file uses py2 print statements), ctr/len(als)
# is integer division and yields 0; float() gives the intended ratio and is
# a no-op under Python 3's true division.
p(float(leaf_count) / len(als))  # ~0.7967

#ex 14
def supergloss(s):
    """Return s's definition followed by one line per hyponym, then one
    line per hypernym, each as ' <synset> <definition> \\n'."""
    lines = [s.definition() + "\n"]
    for related in s.hyponyms():
        lines.append(' ' + str(related) + ' ' + related.definition() + " \n")
    for related in s.hypernyms():
        lines.append(' ' + str(related) + ' ' + related.definition() + " \n")
    return ''.join(lines)
p(supergloss(wn.synset('dog.n.01')))

#ex 15
# BUG FIX: the original printed a (lambda, wordlist) tuple without ever
# applying the predicate, so nothing was actually computed. Build one
# frequency distribution and print every word occurring at least 3 times
# (the original's per-word .count() would also have been O(n) per word).
fd = nltk.FreqDist(brown.words())
p([w for w in fd if fd[w] >= 3])

#ex 16
def lexical_diversity(text):
    """Return len(text) / len(set(text)): the average number of uses of
    each distinct token.

    BUG FIX: under Python 2 both lengths are ints, so `/` truncated the
    ratio; float() restores the intended real-valued result and is
    equivalent under Python 3's true division.
    """
    return len(text) / float(len(set(text)))
# Tabulate the lexical diversity of each Brown category. (Each condition
# ends up with a single sample — that category's diversity score.)
cfd = nltk.ConditionalFreqDist(
    (category, lexical_diversity(nltk.Text(brown.words(categories=category))))
    for category in brown.categories())
cfd.tabulate()


#ex 17
def fifty_most(words):
    """Return the 50 most common words in *words*, excluding English
    stopwords (case-insensitive match).

    PERF FIX: the original re-evaluated stopwords.words('english') — a
    list — for every token, an O(n*m) scan; build the set once for O(1)
    membership tests.
    """
    stop = set(stopwords.words('english'))
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(content).most_common(50)

#ex 18
def fifty_most_bigrams(words):
    """Return the 50 most common bigrams of *words*, excluding English
    stopwords (case-insensitive match).

    PERF FIX: as in fifty_most, hoist the stopword list into a set built
    once instead of re-reading it for every token.
    """
    stop = set(stopwords.words('english'))
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(nltk.bigrams(content)).most_common(50)

#ex 19
# Word frequencies conditioned on Brown genre, tabulated for a few
# hand-picked genres and sample words.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))

genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
words = ['people', 'earth', 'country', 'science', 'sports', 'space', 'love']
cfd.tabulate(conditions=genres, samples=words)

# Observations: news leans on 'people' and 'country'; religion barely
# mentions 'science'; 'sports' appears mostly under hobbies; romance
# favours 'love'; humor is fairly indifferent to all of these samples.

#ex 20
#section?

#ex 21
d = cmudict.dict()
def nsyl(word):
    """Return the syllable count for each CMU pronunciation of *word*.

    In the CMU dictionary a phoneme ends with a digit (its stress marker)
    exactly when it is a vowel, so counting digit-terminated phonemes
    counts syllables.
    """
    # BUG FIX: the original called isdigit(y[-1]); isdigit is a str METHOD,
    # not a free function, so every call raised NameError.
    return [sum(1 for ph in pron if ph[-1].isdigit())
            for pron in d[word.lower()]]


#would be updated