Open satishbhambri91 opened 6 years ago
Considering two reflections submitted by the student, Eval.txt and Eval2.txt, the corpus code collects these two documents and performs the removal of commonly used, non-required words using Term Frequencies (TF) and Inverse Document Frequencies (IDF). The code file and reflections are attached. The next step is to perform latent semantic analysis and then implement NLP techniques for keyword extraction.
import os
import sys
import gensim
import pandas as pd
import nltk
import math
# --- Load the two student reflections and tokenize on spaces --------------
# NOTE(review): paths are hard-coded to one user's desktop — consider
# reading them from sys.argv so the script is portable.
# `with` guarantees every handle is closed even if a read/write fails
# (the original never closed any of these files).
with open("/Users/satishbhambri/Desktop/Eval.txt", "r+") as doc:
    A = doc.read()
with open("/Users/satishbhambri/Desktop/Eval2.txt", "r+") as doc11:
    A11 = doc11.read()

# Naive whitespace tokenization; punctuation stays attached to tokens.
B = A.split(" ")
B11 = A11.split(" ")
#print(B)

# Persist the token lists so they can be inspected as "corpus" files.
with open("/Users/satishbhambri/Desktop/EvalCorpus.txt", "w") as doc2:
    doc2.write(str(B))
with open("/Users/satishbhambri/Desktop/Eval2Corpus.txt", "w") as doc211:
    doc211.write(str(B11))

# Read the first corpus file back (kept for parity with the original;
# C is not used further in this script).
with open("/Users/satishbhambri/Desktop/EvalCorpus.txt", "r+") as doc3:
    #print("The document has been created")
    C = doc3.read()
    #print(C)

# Bag-of-words term counts per document: {token: occurrence count}.
doc4 = set(B)
doc411 = set(B11)
wordDict = dict.fromkeys(doc4, 0)
wordDict11 = dict.fromkeys(doc411, 0)
for word in B:
    wordDict[word] += 1
for word11 in B11:
    wordDict11[word11] += 1
def computeTF(wordDict, doc):
    """Return the term-frequency table for one document.

    wordDict -- {token: raw count} for the document
    doc      -- the document's token list (its length normalizes counts)

    Each raw count is divided by the total number of tokens, so the
    values of a non-empty document sum to 1.0.  The original divided by
    zero on an empty document; here an empty document yields all-zero
    term frequencies instead.
    """
    bowCount = len(doc)
    if bowCount == 0:
        # Guard: no tokens means every term frequency is zero.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(bowCount) for word, count in wordDict.items()}
# Term-frequency tables for each reflection, printed for inspection.
tfBoW = computeTF(wordDict, B)
tfBoW11 = computeTF(wordDict11, B11)
print(tfBoW)
print("And ...")
print(tfBoW11)
def computeIDF(docList):
    """Return inverse-document-frequency scores over a list of documents.

    docList -- list of {token: raw count} dicts, one per document.

    idf(word) = log(N / df(word)), where N is the number of documents
    and df(word) is how many documents contain the word with count > 0.
    Words that appear in every document therefore score log(1) = 0.

    The original seeded the table from docList[0] only and relied on a
    bare ``except`` ("just put in to run the code") to add words first
    seen in later documents.  This version counts document frequency
    over every document directly, so no exception abuse is needed.
    Words whose count is 0 in every document are omitted (the original
    would have raised ZeroDivisionError on them).
    """
    N = len(docList)
    # Count the number of documents that contain each word w.
    dfDict = {}
    for doc in docList:
        for word, val in doc.items():
            if val > 0:
                dfDict[word] = dfDict.get(word, 0) + 1
    # Divide N by the document frequency and take the log of that.
    return {word: math.log(N / float(df)) for word, df in dfDict.items()}
# IDF scores computed across both reflections.
idfs = computeIDF([wordDict, wordDict11])
print(idfs)
def computeTFIDF(tfBow, idfs):
    """Combine a TF table and an IDF table into tf-idf scores.

    tfBow -- {token: term frequency} for one document
    idfs  -- {token: inverse document frequency} covering tfBow's keys
    """
    return {word: val * idfs[word] for word, val in tfBow.items()}
# Final tf-idf tables for both documents.
tfidfBowA = computeTFIDF(tfBoW, idfs)
tfidfBowB = computeTFIDF(tfBoW11, idfs)
print("Finally..")
print(tfidfBowA)
print(tfidfBowB)
Eval2Corpus.txt EvalCorpus.txt
Output : {'prove': 0.00525111500424201, 'something': 0.00525111500424201, 'important': 0.00525111500424201, 'second': 0.00525111500424201, 'weight.\n\t': 0.00525111500424201, 'algorithm': 0.01050223000848402, 'member': 0.00525111500424201, 'pushed': 0.00525111500424201, 'I’m': 0.00525111500424201, 'our': 0.0, 'proud': 0.00525111500424201, 'instead': 0.00525111500424201, 'was': 0.0, 'honest': 0.00525111500424201, 'and': 0.0, 'up': 0.00525111500424201, 'perfectly': 0.00525111500424201, 'still': 0.00525111500424201, 'do': 0.01050223000848402, 'knew': 0.00525111500424201, 'lonesome.': 0.00525111500424201, 'I': 0.0, 'announced': 0.00525111500424201, 'my': 0.0, 'like': 0.01050223000848402, 'doing': 0.00525111500424201, 'sorting': 0.01050223000848402, 'of': 0.0, 'To': 0.00525111500424201, 'consensus': 0.00525111500424201, 'When': 0.00525111500424201, 'submitting,': 0.00525111500424201, 'Whether': 0.00525111500424201, 'project': 0.0, 'being': 0.01050223000848402, 'incredibly': 0.00525111500424201, 'however': 0.0, '\n': 0.00525111500424201, 'it': 0.0, 'sort': 0.0, 'going': 0.00525111500424201, 'even': 0.00525111500424201, 'final': 0.01050223000848402, 'a': 0.0, 'pressure': 0.00525111500424201, '4': 0.00525111500424201, 'team': 0.01050223000848402, 'intimidated': 0.00525111500424201, 'Professor': 0.00525111500424201, 'nervous.': 0.00525111500424201, 'assigned': 0.00525111500424201, 'group,': 0.00525111500424201, 'an': 0.00525111500424201, 'would': 0.00525111500424201, 'the': 0.0, 'different.': 0.00525111500424201, 'part': 0.0, 'hard': 0.00525111500424201, 'myself;': 0.00525111500424201, 'that': 0.0, 'we': 0.0, 'he': 0.00525111500424201, 'dead': 0.00525111500424201, 'wasn’t': 0.00525111500424201, 'initially': 0.00525111500424201, 'sorts': 0.00525111500424201, 'be': 0.0, 'before': 0.0, 'on': 0.0, 'worked': 0.00525111500424201, 'end': 0.00525111500424201, 'this': 0.0, 'very': 0.0, 'finished': 0.0, 'so,': 0.00525111500424201, '\n\tThis': 0.00525111500424201, 'me': 0.0, 'or': 
0.00525111500424201, 'by': 0.00525111500424201, 'feeling': 0.00525111500424201, 'to': 0.0, 'hard.': 0.00525111500424201, 'expected': 0.00525111500424201, 'My': 0.00525111500424201, 'code.': 0.00525111500424201, 'desire': 0.00525111500424201, 'It': 0.00525111500424201, 'about': 0.00525111500424201, 'as': 0.0} {'gave': 0.00525111500424201, 'Gnome': 0.00525111500424201, 'most': 0.00525111500424201, 'completely': 0.00525111500424201, 'come,': 0.00525111500424201, 'a': 0.0, 'intimidating': 0.00525111500424201, 'didn’t': 0.00525111500424201, 'on': 0.0, 'yet': 0.00525111500424201, 'we': 0.0, 'day': 0.00525111500424201, 'were': 0.01575334501272603, 'sort': 0.0, 'our': 0.0, 'needed': 0.00525111500424201, 'it.': 0.00525111500424201, 'as': 0.0, 'was': 0.0, 'once': 0.00525111500424201, 'task': 0.00525111500424201, 'team.': 0.00525111500424201, 'who': 0.00525111500424201, 'time.': 0.00525111500424201, 'reliable': 0.01050223000848402, 'in': 0.01050223000848402, 'part': 0.0, 'submitted': 0.00525111500424201, 'Obviously': 0.00525111500424201, 'selection': 0.02100446001696804, 'sort,': 0.00525111500424201, 'how': 0.00525111500424201, 'teammates': 0.01050223000848402, 'teammate': 0.00525111500424201, 'Two': 0.01050223000848402, 'I': 0.0, 'teaching': 0.00525111500424201, 'confidence': 0.00525111500424201, 'working': 0.02100446001696804, 'and': 0.0, 'strong': 0.00525111500424201, 'my': 0.0, 'be': 0.0, 'wildcard': 0.00525111500424201, 'submission.': 0.00525111500424201, 'of': 0.0, 'came': 0.00525111500424201, 'finish': 0.00525111500424201, 'manage': 0.00525111500424201, 'this': 0.0, 'felt': 0.00525111500424201, 'that': 0.0, 'might': 0.00525111500424201, 'entirely': 0.00525111500424201, 'very': 0.0, 'finished': 0.0, 'time,': 0.00525111500424201, 'project': 0.0, 'me': 0.0, 'however': 0.0, 'However': 0.00525111500424201, 'than': 0.00525111500424201, 'written': 0.01050223000848402, 'it': 0.0, 'the': 0.0, 'submission': 0.00525111500424201, 'to': 0.0, 'programmer,': 0.00525111500424201, 
'change': 0.00525111500424201, 'faster': 0.01050223000848402, 'because': 0.00525111500424201, 'good': 0.00525111500424201, 'given': 0.00525111500424201, 'well': 0.00525111500424201, 'before': 0.0, 'implement': 0.00525111500424201, 'sort.': 0.00525111500424201, 'This': 0.00525111500424201, 'possibility': 0.00525111500424201}
Hence, commonly used words like "I", "it", and "the" yield a score of 0 and can be removed.
Process finished with exit code 0
The next part involves performing NLP analysis for keyword extraction — e.g. [“document”, “story”, “machine translation”, “translation”, “figure”] -> [“machine translation”] — using Word2Vec and Doc2Vec.