# NLTK Book, Chapter 2 — exercise answers 1-7.
# (Rewritten as valid Python 3: the original used Python 2 print statements,
# `len(map(...))`, and left the ex-6 prose answer un-commented.)
import pprint

import nltk
from matplotlib import pylab
from nltk.corpus import brown, cmudict, gutenberg, names, state_union, stopwords
from nltk.corpus import wordnet as wn


def p(x):
    """Pretty-print any value (shorthand used throughout the exercises)."""
    pprint.pprint(x)


# ex 1: basic list operations on a tokenized sentence.
sent = "The quick brown fox jumps over the lazy dog"
phrase = sent.split()
phrase.append('extra words')      # addition
for w in phrase:
    p(phrase.index(w))            # indexing (first occurrence of each token)
p(phrase * 2)                     # multiplication
p(phrase[2:5])                    # slicing
p(sorted(w.lower() for w in phrase))

# ex 2: token count and case-folded vocabulary size of Austen's "Persuasion".
austen = gutenberg.words('austen-persuasion.txt')
p(len(austen))                            # word tokens (lowercasing doesn't change the count)
p(len({w.lower() for w in austen}))       # distinct word types, case-folded

# ex 3: sample words from two Brown corpus categories.
brown_categories = brown.categories()
p(brown.words(categories=[brown_categories[1], brown_categories[2]])[100:])


# ex 4: word counts per State of the Union address.
def tabulate(cfdist, words, categories):
    """Print a table of cfdist[category][word] counts, one row per category."""
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()


cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))
tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())

# ex 5: holonyms and meronyms of "book".
p(wn.synset('book.n.01').part_holonyms())
p(wn.synset('book.n.01').substance_holonyms())
p(wn.synset('book.n.01').member_holonyms())
p(wn.synset('book.n.01').part_meronyms())
p(wn.synset('book.n.01').substance_meronyms())
p(wn.synset('book.n.01').member_meronyms())

# ex 6 (written answer): Circular translations could result in inaccuracies,
# even errors. So while translating from one language to another and then
# translating back, comparing with other languages could be helpful to
# reduce imperfections.

# ex 7: concordance of "however" in three different texts.
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
p(emma.concordance('however'))
brn = nltk.Text(nltk.corpus.brown.words('ca01'))
p(brn.concordance('however'))
cht = nltk.Text(state_union.words(state_union.fileids()[0]))
p(cht.concordance('however'))
# mistake in the usage of however!
# NLTK Book, Chapter 2 — exercise answers 8-20.

# ex 8: distribution of first letters of names, one condition per gender file.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()

# ex 9: not attempted.
# ex 10: not attempted.
# ex 11: not attempted.

# ex 12: CMU pronouncing dictionary statistics.
cmd = nltk.corpus.cmudict.dict()
p(len({w.lower() for w in cmd}))  # distinct words: 123455
# words with more than one pronunciation: 9241
p(sum(1 for prons in cmd.values() if len(prons) > 1))

# ex 13: fraction of noun synsets that have no hyponyms (leaf nouns).
noun_synsets = list(wn.all_synsets('n'))
leaf_count = sum(1 for s in noun_synsets if not s.hyponyms())
p(leaf_count / len(noun_synsets))  # 0.7967119283931072


# ex 14: a synset's definition together with those of its hyponyms/hypernyms.
def supergloss(s):
    """Return s's definition plus one line per hyponym and hypernym."""
    lines = [s.definition() + "\n"]
    for related in s.hyponyms() + s.hypernyms():
        lines.append(' ' + str(related) + ' ' + related.definition() + " \n")
    return ''.join(lines)


p(supergloss(wn.synset('dog.n.01')))

# ex 15: Brown words occurring at least three times.
# (The original printed an *uncalled* lambda paired with the word list;
# fixed to actually filter, via FreqDist instead of O(n^2) repeated count().)
brown_fd = nltk.FreqDist(brown.words())
p(sorted(w for w, freq in brown_fd.items() if freq >= 3))


# ex 16: lexical diversity per Brown genre.
def lexical_diversity(text):
    """Average number of tokens per distinct word type."""
    return len(text) / len(set(text))


cfd = nltk.ConditionalFreqDist(
    (category, lexical_diversity(nltk.Text(brown.words(categories=category))))
    for category in brown.categories())
cfd.tabulate()


# ex 17: 50 most frequent words, ignoring stopwords.
def fifty_most(words):
    """Return the 50 most common non-stopword tokens as (word, count) pairs."""
    stop = set(stopwords.words('english'))  # build once: O(1) membership tests
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(content).most_common(50)


# ex 18: 50 most frequent bigrams, ignoring stopword tokens.
def fifty_most_bigrams(words):
    """Return the 50 most common bigrams of non-stopword tokens."""
    stop = set(stopwords.words('english'))  # build once: O(1) membership tests
    content = [w for w in words if w.lower() not in stop]
    return nltk.FreqDist(nltk.bigrams(content)).most_common(50)


# ex 19: selected word counts across Brown genres.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
words = ['people', 'earth', 'country', 'science', 'sports', 'space', 'love']
cfd.tabulate(conditions=genres, samples=words)
# Observations: news mostly deals with people and country; religion does not
# mention science at all; sports appears mainly under hobbies; romance of
# course talks about love; humor shows little interest in any of these words.

# ex 20: section?
# ex 21: syllable counts via the CMU pronouncing dictionary.
d = None  # CMU dict; loaded lazily on first use instead of at import time


def nsyl(word, pron_dict=None):
    """Return the syllable count for each known pronunciation of *word*.

    A syllable is counted for every phoneme whose last character is a
    stress digit (0/1/2), per ARPAbet convention. *pron_dict* defaults to
    the CMU pronouncing dictionary; pass a mapping of lowercase word ->
    list of phoneme lists to override (useful for testing).
    Raises KeyError when *word* is not in the dictionary.
    """
    global d
    if pron_dict is None:
        if d is None:
            d = cmudict.dict()
        pron_dict = d
    # The original called isdigit(y[-1]) — there is no such builtin
    # function; fixed to the str method phoneme[-1].isdigit().
    return [sum(1 for phoneme in pron if phoneme[-1].isdigit())
            for pron in pron_dict[word.lower()]]
# would be updated
# Source: blog post "Natural Language Processing with Python: Chapter 2 Answers"
# Tuesday, June 23, 2015
# (Remaining blog-page text, commented out so the file stays valid Python:)
# Subscribe to: Post Comments (Atom)
# great (y)
# ReplyDelete