import pprint
from nltk.corpus import wordnet as wn
import nltk
from nltk.corpus import *
from nltk.corpus import state_union
from matplotlib import pylab
def p(x):
    """Shortcut: pretty-print *x* to stdout."""
    printer = pprint.PrettyPrinter()
    printer.pprint(x)
#ex 1 -- basic list operations on a tokenised sentence
sent = "The quick brown fox jumps over the lazy dog"
phrase = sent.split()
phrase.append('extra words')        # addition
for w in phrase:
    # first-occurrence index; the tokens here are all distinct, so this
    # is each word's actual position
    p(phrase.index(w))              # indexing
p(phrase * 2)                       # multiplication
p(phrase[2:5])                      # slicing
p(sorted(x.lower() for x in phrase))
#ex 2 -- token and type counts for Persuasion, case-folded
austen = gutenberg.words('austen-persuasion.txt')
# A list comprehension works on both Python 2 and 3; the original
# len(map(...)) raises TypeError on Python 3, where map returns an iterator.
austen_lower = [w.lower() for w in austen]
p(len(austen_lower))
p(len(set(austen_lower)))
#ex 3 -- words from two Brown genres
brown_categories = brown.categories()
# NOTE(review): [100:] prints everything from index 100 onward; [:100]
# (the first 100 words) may have been intended -- behaviour kept as-is.
p(brown.words(categories=[brown_categories[1], brown_categories[2]])[100:])
#ex 4
def tabulate(cfdist, words, categories):
    """Print a table of counts: one row per category, one column per word.

    cfdist     -- mapping of category -> (word -> count), e.g. an
                  nltk.ConditionalFreqDist or a plain dict of dicts
    words      -- column sample values
    categories -- row condition values

    Fixed: converted from Python 2 ``print x,`` statements (a SyntaxError
    on Python 3) to ``print(..., end=' ')``, which emits the same
    space-separated layout.
    """
    print('%-16s' % 'Category', end=' ')
    for word in words:                      # header row
        print('%6s' % word, end=' ')
    print()
    for category in categories:             # one row of counts per category
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()
# Build (fileid, word) pairs for every State of the Union address, then
# tabulate how often each target word appears per address.
fileid_word_pairs = [(fileid, word)
                     for fileid in state_union.fileids()
                     for word in state_union.words(fileid)]
cfd = nltk.ConditionalFreqDist(fileid_word_pairs)
tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())
#ex 5 -- holonym/meronym relations of book.n.01, printed in the same order
book = wn.synset('book.n.01')
for relation in (book.part_holonyms, book.substance_holonyms,
                 book.member_holonyms, book.part_meronyms,
                 book.substance_meronyms, book.member_meronyms):
    p(relation())
#ex 6
# ex 6 answer (prose, commented out so the file remains valid Python):
# Circular translations can introduce inaccuracies, even outright errors. So
# when translating from one language to another and back again, comparing the
# result against other languages can help reduce those imperfections.
#ex 7 -- concordance of 'however' across three different corpora
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
brn = nltk.Text(nltk.corpus.brown.words('ca01'))
cht = nltk.Text(state_union.words(state_union.fileids()[0]))
for text in (emma, brn, cht):
    # some State of the Union hits use 'however' incorrectly
    p(text.concordance('however'))
#ex 8 -- distribution of first letters of names, by names-corpus file
initial_letter_pairs = ((fileid, name[0])
                        for fileid in names.fileids()
                        for name in names.words(fileid))
cfd = nltk.ConditionalFreqDist(initial_letter_pairs)
cfd.plot()
#ex 9
#?
#ex 10
#?
#ex 11
#?
#ex 12 -- CMU pronouncing dictionary statistics
cmd = nltk.corpus.cmudict.dict()
# number of distinct lower-cased entries
p(len({word.lower() for word in cmd}))  # 123455
# number of words that have more than one listed pronunciation
ctr = sum(1 for pronunciations in cmd.values() if len(pronunciations) > 1)
p(ctr)  # 9241
#ex 13
ctr = 0
als = list(wn.all_synsets('n'))
for w in als:
if(len(w.hyponyms()) == 0):
ctr = ctr + 1
p(ctr/len(als)) #0.7967119283931072
#ex 14
def supergloss(s):
    """Return *s*'s definition followed by one line per hyponym and then
    one line per hypernym, each showing the related synset and its
    definition."""
    lines = [s.definition() + "\n"]
    for related in s.hyponyms():
        lines.append(' ' + str(related) + ' ' + related.definition() + " \n")
    for related in s.hypernyms():
        lines.append(' ' + str(related) + ' ' + related.definition() + " \n")
    return ''.join(lines)
p(supergloss(wn.synset('dog.n.01')))
#ex 15 -- all Brown words occurring at least three times
# Fixed: the original built and printed a (lambda, word_list) tuple without
# ever applying the filter; it also would have re-counted the whole corpus
# per word. A frequency distribution does one pass.
fdist = nltk.FreqDist(brown.words())
p([w for w in fdist if fdist[w] >= 3])
#ex 16
def lexical_diversity(text):
    """Ratio of total tokens to distinct tokens in *text*
    (average number of uses per type)."""
    distinct = set(text)
    return len(text) / len(distinct)
# Tabulate lexical diversity per Brown genre.
diversity_pairs = ((category,
                    lexical_diversity(nltk.Text(brown.words(categories=category))))
                   for category in brown.categories())
cfd = nltk.ConditionalFreqDist(diversity_pairs)
cfd.tabulate()
#ex 17
def fifty_most(words):
    """Return the 50 most common words in *words*, ignoring English stopwords.

    Fixed: the stopword list is now fetched once and held in a set for O(1)
    membership tests; the original re-read stopwords.words('english') (a
    list) for every token, making the filter accidentally O(n*m).
    """
    stop = set(stopwords.words('english'))
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(content).most_common(50)
#ex 18
def fifty_most_bigrams(words):
    """Return the 50 most common bigrams over *words* with English stopwords
    removed before pairing.

    Fixed: the stopword list is now fetched once and held in a set for O(1)
    membership tests; the original re-read stopwords.words('english') (a
    list) for every token, making the filter accidentally O(n*m).
    """
    stop = set(stopwords.words('english'))
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(nltk.bigrams(content)).most_common(50)
#ex 19 -- word frequencies of selected samples across selected Brown genres
genre_word_pairs = ((genre, word)
                    for genre in brown.categories()
                    for word in brown.words(categories=genre))
cfd = nltk.ConditionalFreqDist(genre_word_pairs)
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
words = ['people', 'earth', 'country', 'science', 'sports', 'space', 'love']
cfd.tabulate(conditions=genres, samples=words)
# Observations: news leans on 'people' and 'country'; religion barely touches
# 'science'; 'sports' appears mostly under hobbies; romance, unsurprisingly,
# favours 'love'; humor shows no strong preference for any of these samples.
#ex 20
#section?
#ex 21
d = cmudict.dict()
def nsyl(word):
    """Return the syllable count for each CMU pronunciation of *word*.

    In the CMU dictionary, vowel phonemes carry a trailing stress digit
    (e.g. 'AH0'), so counting phones whose last character is a digit
    counts syllables.

    Fixed: the original called ``isdigit(y[-1])`` -- there is no such
    builtin (NameError at call time); the str method ``y[-1].isdigit()``
    was intended.
    """
    return [len([phone for phone in pron if phone[-1].isdigit()])
            for pron in d[word.lower()]]
#would be updated
# --- Blog footer residue (non-code, commented out so the file parses) ---
# Tuesday, June 23, 2015
# Natural Language Processing with Python: Chapter 2 Answers
# Subscribe to:
# Post Comments (Atom)
# great (y)
# ReplyDelete