Open clhl opened 6 years ago
Source code:
# Load the pre-processed CSV once, at import time, so the triple extractor
# below can read its ``id`` column.
df = pd.read_csv(
    "pre-processed_file_1.csv",
    sep=",",
)
def subject_verb_object_triples(doc):
    """
    Extract an ordered sequence of subject-verb-object (SVO) triples from a
    spacy-parsed doc, each paired with the ``id`` column of the module-level
    ``df``. Note that this only works for SVO languages.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc`` or ``spacy.Span``)

    Yields:
        Tuple[``spacy.Span``, ``spacy.Span``, ``spacy.Span``, object]: The next
        4-tuple ``(subject, verb, object, ids)``, in order of appearance. The
        first three items are spans from ``doc``; the fourth is ``df.id``,
        identical for every triple.
    """
    # TODO: What to do about questions, where it may be VSO instead of SVO?
    # TODO: What about non-adjacent verb negations?
    # TODO: What about object (noun) negations?

    # Looked up once instead of once per object: the value does not depend on
    # the sentence, verb, subject or object. Named ``ids`` so the builtin
    # ``id`` is not shadowed.
    # NOTE(review): ``df.id`` is presumably the *entire* ``id`` column of the
    # CSV, not the id of the row that produced ``doc`` -- confirm whether a
    # per-document id was intended.
    ids = df.id

    if isinstance(doc, SpacySpan):
        sents = [doc]
    else:  # textacy.Doc or spacy.Doc
        sents = doc.sents

    for sent in sents:
        # Token indices (``.i``) are doc-relative; subtracting the index of the
        # sentence's first token makes them usable for slicing ``sent``.
        start_i = sent[0].i

        for verb in spacy_utils.get_main_verbs_of_sent(sent):
            subjs = spacy_utils.get_subjects_of_verb(verb)
            if not subjs:
                continue
            objs = spacy_utils.get_objects_of_verb(verb)
            if not objs:
                continue

            # add adjacent auxiliaries to verbs, for context
            # and add compounds to compound nouns
            verb_bounds = spacy_utils.get_span_for_verb_auxiliaries(verb)
            verb_span = sent[verb_bounds[0] - start_i: verb_bounds[1] - start_i + 1]

            for subj in subjs:
                subj_start = spacy_utils.get_span_for_compound_noun(subj)[0]
                subj_span = sent[subj_start - start_i: subj.i - start_i + 1]

                for obj in objs:
                    if obj.pos == NOUN:
                        obj_bounds = spacy_utils.get_span_for_compound_noun(obj)
                    elif obj.pos == VERB:
                        obj_bounds = spacy_utils.get_span_for_verb_auxiliaries(obj)
                    else:
                        # Any other part of speech: the object is that single token.
                        obj_bounds = (obj.i, obj.i)
                    obj_span = sent[obj_bounds[0] - start_i: obj_bounds[1] - start_i + 1]

                    # Fourth element added so each result carries the id column.
                    yield (subj_span, verb_span, obj_span, ids)