clhl / SVO_extract


Code so far #1

Open clhl opened 6 years ago

clhl commented 6 years ago
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.metrics import edit_distance
import spacy
import textacy
import csv, string, re
import jsonrpc
import numpy as np
from simplejson import loads
from textblob import TextBlob
from itertools import chain
import datetime as dt
import pandas as pd

#Set working directory
import os
os.chdir('/Users/charlottelousada/Documents/Thesis working file')
#Import csv file with pre-processing already carried out
df = pd.read_csv("pre-processed_file_1.csv", sep=",")
#Format dates
df['created_at'] = df['created_at'].str[4:]
df['created_at'] = df['created_at'].str[:7]+df['created_at'].str[22:]
df['date'] = df['created_at'].apply(lambda x: dt.datetime.strptime(x,'%b %d %Y'))
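#Worked example of the slicing above, assuming Twitter's standard
#created_at format:
#'Tue Mar 29 08:11:25 +0000 2016' -> str[4:]  -> 'Mar 29 08:11:25 +0000 2016'
#-> str[:7] + str[22:]            -> 'Mar 29 2016', which '%b %d %Y' parses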

"""
#Prepare dataframe to be relevant columns and unicode
import StringIO
s = StringIO.StringIO()
tweets = df1.to_csv(encoding='utf-8');
nlp = spacy.load('en')

#Make dataframe with SVO extraction, attaching the tweet id to each triple
count = 0
df2 = pd.DataFrame()
for index, row in df.loc[:, ['text_1', 'id']].iterrows():
    doc = nlp(unicode(row['text_1']))
    text_ext = textacy.extract.subject_verb_object_triples(doc)
    #keep the tweet id alongside each (subject, verb, object) triple so
    #the 4-column names below line up
    mylist = [(subj, verb, obj, row['id']) for subj, verb, obj in text_ext]
    count += 1
    df2 = df2.append(mylist, ignore_index=True)

#Join dataframe to attach ID
df2.columns = ['Subject', 'Verb', 'Object','ID']
print(df2)
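For reference, a quick sanity check of what textacy's extractor returns on a single sentence (sample text made up; the exact spans depend on the model):

doc = nlp(u"NASA launches a new satellite")
print(list(textacy.extract.subject_verb_object_triples(doc)))
#e.g. [(NASA, launches, satellite)] -- each element is a spacy.Span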
clhl commented 6 years ago
Source code:
#These imports are needed when the function runs outside textacy's own
#module (paths assume the textacy/spaCy versions of the time)
from spacy.tokens import Span as SpacySpan
from spacy.symbols import NOUN, VERB
from textacy import spacy_utils

#This line is added to reference the dataframe
df = pd.read_csv("pre-processed_file_1.csv", sep=",")

def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc. Note that this only works for SVO languages.
    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)
    Yields:
        Tuple[``spacy.Span``, ``spacy.Span``, ``spacy.Span``]: The next 3-tuple
        of spans from ``doc`` representing a (subject, verb, object) triple,
        in order of appearance.
    """
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?
    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        start_i = sent[0].i

        verbs = spacy_utils.get_main_verbs_of_sent(sent)
        for verb in verbs:
            subjs = spacy_utils.get_subjects_of_verb(verb)
            if not subjs:
                continue
            objs = spacy_utils.get_objects_of_verb(verb)
            if not objs:
                continue

            # add adjacent auxiliaries to verbs, for context
            # and add compounds to compound nouns
            verb_span = spacy_utils.get_span_for_verb_auxiliaries(verb)
            verb = sent[verb_span[0] - start_i: verb_span[1] - start_i + 1]
            for subj in subjs:
                subj = sent[spacy_utils.get_span_for_compound_noun(subj)[0] - start_i: subj.i - start_i + 1]
                for obj in objs:
                    if obj.pos == NOUN:
                        span = spacy_utils.get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        span = spacy_utils.get_span_for_verb_auxiliaries(obj)
                    else:
                        span = (obj.i, obj.i)
                    obj = sent[span[0] - start_i: span[1] - start_i + 1]
#This line is added to define id (note: df.id is the whole id column,
#not the id of the current row)
                    id = df.id
#This line is changed to add ', id' so it yields the 4 columns
                    yield (subj, verb, obj, id)
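For completeness, a minimal sketch of how the modified function could be called (column names text_1 and id match the dataframe above). Because df.id inside the function is the entire id column, pairing the row id in the calling loop, as in the first comment, remains the more robust approach:

for index, row in df.loc[:, ['text_1', 'id']].iterrows():
    doc = nlp(unicode(row['text_1']))
    for triple in subject_verb_object_triples(doc):
        print(triple)  #(subject, verb, object, id)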