Open anirbanrkmission opened 4 years ago
# Strip links to specific hosts from text[i]; each match runs from the host
# name to the next whitespace character. The replacement is the empty string.
# The first two patterns begin with a space, so they only match when the host
# is preceded by one.
text[i] = re.sub(r' parenting.wpaddictive.com/\S+', '', text[i])
text[i] = re.sub(r' conclud.com/\S+', '', text[i])
text[i] = re.sub(r'mummuddlingthrough.com/\S+', '', text[i])
text[i] = re.sub(r'www.instagram.com/\S+', '', text[i])
text[i] = re.sub(r'twitter.com/\S+', '', text[i])
text[i] = re.sub(r'cgispread.com/\S+', '', text[i])
text[i] = re.sub(r'curiousdaydreams.com/\S+', '', text[i])
text[i] = re.sub(r'Boom.lnk.to/\S+', '', text[i])
text[i] = re.sub(r'etsy.me/\S+', '', text[i])
#from 3236 to 3095
Apply to data and add screenshots. @sayendutta
`text[i] = text[i].lower()
# Loop-body fragment: normalises one tweet, text[i], in place.
# Lower-casing happens first, so every pattern below runs against
# lower-cased text.
# Strip @mentions made of ASCII letters and digits.
text[i] = re.sub(r'@[A-Za-z0-9]+', '', text[i])
# Drop the '#' character itself; the hashtag word is kept.
text[i] = re.sub(r'#', '', text[i])
# NOTE(review): no re.IGNORECASE flag is passed and the text is already
# lower-cased, so the upper-case 'RT' in this pattern cannot match.
text[i] = re.sub(r'RT[\s]+', '', text[i])
# Remove http(s) URLs. \S+ stops at the first whitespace character, so a link
# that contains spaces is only removed up to its first space.
text[i] = re.sub(r'https?:\/\/\S+', '', text[i])
# Remove any 'http://' / 'https://' prefix that is still present.
text[i] = re.sub(r'https?:\/\/', '', text[i])
# Remove scheme-less links for specific hosts. The '.' in these patterns is
# unescaped, so it matches any character, not only a literal dot.
text[i] = re.sub(r'photos.app.goo.gl/\S+', '', text[i])
text[i] = re.sub(r'pic.twitter.com/\S+', '', text[i])
text[i] = re.sub(r'instagram.com/\S+', '', text[i])
# The next two patterns begin with a space, so they only match when the host
# is preceded by one.
text[i]=re.sub(r' parenting.wpaddictive.com/\S+','',text[i])
text[i]=re.sub(r' conclud.com/\S+','',text[i])
text[i]=re.sub(r'mummuddlingthrough.com/\S+','',text[i])
# Left disabled; the 'instagram.com/\S+' substitution above has already run
# by this point.
#text[i] = re.sub(r'www.instagram.com/\S+', '', text[i])
text[i] = re.sub(r'twitter.com/\S+', '', text[i])
text[i] = re.sub(r'cgispread.com/\S+', '', text[i])
text[i] = re.sub(r'curiousdaydreams.com/\S+', '', text[i])
# NOTE(review): this pattern contains an upper-case 'B' but the text was
# lower-cased at the top, so it cannot match.
text[i] = re.sub(r'Boom.lnk.to/\S+', '', text[i])
text[i] = re.sub(r'etsy.me/\S+', '', text[i])`
It is giving the following output:
original: 'Happy Mothers Day to the craziest woman I know! Thanks for always pushing me to be the best person I can be https://www. instagram.com/p/B-CutO5AZqsj FDdexTkakEBNZvTenYCmaMLD5M0/?igshid=1cq87e54bsk22 …', "Happy Mother's Day to my amazing wife! We both love you like crazy, even though you're a bit weird. Xxx #MothersDay #MotheringSunday https://www. instagram.com/p/B-CSLKgnpBlg 1UWh-kBywdBCSmWFxJ71fC7vZ80/?igshid=7nx6dbn42pi0 …", 'Wishing you all a safe & happy Mothers Day #mothersday #reedham #relax @Reedham ferry Inn. https://www. instagram.com/p/B-Coi97jcDG/ ?igshid=1wk66ouvsqkt3 …'
processed: 'happy mothers day to the craziest woman i know! thanks for always pushing me to be the best person i can be fddextkakebnzvtenycmamld5m0/?igshid=1cq87e54bsk22 …' "happy mother's day to my amazing wife! we both love you like crazy, even though you're a bit weird. xxx mothersday motheringsunday 1uwh-kbywdbcsmwfxj71fc7vz80/?igshid=7nx6dbn42pi0 …" 'wishing you all a safe & happy mothers day mothersday reedham relax ferry inn. ?igshid=1wk66ouvsqkt3 …'
def clean_text(text):
    """Clean every tweet in ``text`` in place and return ``text``.

    Each element is processed as follows:

    1. A retweet marker (``RT`` followed by whitespace) is removed while the
       text still has its original case.
    2. The text is lower-cased.
    3. @mentions, ``#`` characters, http(s) URLs, scheme-less links to a few
       known hosts, and any whitespace-delimited token containing
       ``igshid=`` are removed.
    4. Every run of characters other than ASCII letters and digits becomes a
       single space.
    5. Digit-bearing fragments matching ``\\w\\d+\\w\\d*`` and any remaining
       ``igshid`` text are removed.

    Args:
        text: Mutable, indexable sequence of strings (described alongside
            this function as a NumPy array of sentences; a list works too).

    Returns:
        The same ``text`` object, with every element replaced by its cleaned
        form. Whitespace is not trimmed.
    """
    # Must run before lower-casing: once the text is lower-cased an
    # upper-case "RT" pattern can never match. The word boundary keeps words
    # that merely end in "RT" intact.
    retweet_marker = re.compile(r'\bRT\s+')

    # Applied in this order, each replaced by the empty string. Compiled once
    # here rather than once per element.
    removals = [
        re.compile(pattern)
        for pattern in (
            r'@[A-Za-z0-9]+',
            r'#',
            r'https?://\S+',
            r'https?://',
            r'photos\.app\.goo\.gl/\S+',
            r'pic\.twitter\.com/\S+',
            r'instagram\.com/\S+',
            # Links that arrive split by spaces leave their tail behind as a
            # separate token; drop any token carrying an igshid query.
            r'\S*igshid=\S*',
        )
    ]
    non_alphanumeric = re.compile(r'[^a-zA-Z0-9]+')
    digit_fragment = re.compile(r'\w\d+\w\d*')

    for index, sentence in enumerate(text):
        cleaned = retweet_marker.sub('', sentence).lower()
        for pattern in removals:
            cleaned = pattern.sub('', cleaned)
        # After this step only ASCII letters, digits and spaces remain, so no
        # separate punctuation pass is needed afterwards.
        cleaned = non_alphanumeric.sub(' ', cleaned)
        cleaned = digit_fragment.sub('', cleaned)
        cleaned = cleaned.replace('igshid', '')
        text[index] = cleaned
    return text
text: NumPy array of sentences. re: the regex module. Returns the processed text as a NumPy array.