tanloong / neosca

L2SCA & LCA fork: cross-platform, GUI, without Java dependency
GNU General Public License v3.0

Thanks a lot for updating this! #44

Closed p-acharya closed 3 months ago

p-acharya commented 3 months ago

Thanks a lot for updating this to use Stanza.

Would it be possible to update the README to explain how to use the command-line tool? Also, how can I replicate the following script using neosca?

This is taken from the original L2SCA code

import os
import re
import subprocess

def division(x, y):
    # Safe division: return 0 when either operand is 0, to avoid ZeroDivisionError
    if float(x) == 0 or float(y) == 0:
        return 0
    return float(x) / float(y)

# Tregex patterns for the syntactic structures counted by L2SCA
patternlist = [
    "'ROOT !> __'",  # S: sentence
    "'VP > S|SINV|SQ'",  # VP: verb phrase
    "'S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]'",  # C: clause
    "'S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]'",  # T: T-unit
    "'SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'",  # DC: dependent clause
    "'S|SBARQ|SINV|SQ [> ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP]] << (SBAR < (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])]))'",  # CT: complex T-unit
    "'ADJP|ADVP|NP|VP < CC'",  # CP: coordinate phrase
    "'NP !> NP [<< JJ|POS|PP|S|VBG | << (NP $++ NP !$+ CC)]'",  # CN1: complex nominal (noun phrase)
    "'SBAR [<# WHNP | <# (IN < That|that|For|for) | <, S] & [$+ VP | > VP]'",  # CN2: complex nominal (nominal clause)
    "'S < (VP <# VBG|TO) $+ VP'",  # CN3: complex nominal (gerund/infinitive subject)
    "'FRAG > ROOT !<< (S|SINV|SQ [> ROOT <, (VP <# VB) | <# MD|VBZ|VBP|VBD | < (VP [<# MD|VBP|VBZ|VBD | < CC < (VP <# MD|VBP|VBZ|VBD)])])'",  # fragment counted as a clause
    "'FRAG > ROOT !<< (S|SBARQ|SINV|SQ > ROOT | [$-- S|SBARQ|SINV|SQ !>> SBAR|VP])'",  # fragment counted as a T-unit
    "'MD|VBZ|VBP|VBD > (SQ !< VP)'"  # verb form heading an inverted (question) structure, added to VP
]

def analyze_text(raw_text, short_names=False):
    current_dir = os.path.dirname(os.path.abspath(__file__))
    inputFile = os.path.join(current_dir, "input_file_temp.txt")

    with open(inputFile, "w") as temp_file:
        temp_file.write(raw_text)

    # Name a temporary file to hold the parse trees of the input file
    parsedFile = inputFile + ".parsed"

    parserPath = os.path.join(current_dir, "stanford-parser-full-2020-11-17/lexparser.sh")

    # Parse the input file
    command = f"{parserPath} {inputFile} > {parsedFile}"
    subprocess.getoutput(command)

    # List of counts of the patterns
    patterncount = []

    tregex_path = os.path.join(current_dir, "tregex.sh")

    # Query the parse trees using the tregex patterns
    for pattern in patternlist:
        command = f"{tregex_path} {pattern} {parsedFile} -C -o"
        count = subprocess.getoutput(command).split('\n')[-1]
        patterncount.append(int(count))

    # Update frequencies: complex nominals sum their three sub-patterns; clauses
    # and T-units also count clause-less fragments; verb phrases also count
    # verb forms heading inverted (question) structures
    patterncount[7] = patterncount[-4] + patterncount[-5] + patterncount[-6]  # CN = CN1 + CN2 + CN3
    patterncount[2] = patterncount[2] + patterncount[-3]  # C += fragment clauses
    patterncount[3] = patterncount[3] + patterncount[-2]  # T += fragment T-units
    patterncount[1] = patterncount[1] + patterncount[-1]  # VP += question verb forms

    # Word count: match the terminal (POS tag + token) nodes in the parse trees
    with open(parsedFile, "r") as infile:
        content = infile.read()
    w = len(re.findall(r"\([A-Z]+\$? [^\)\(-]+\)", content))

    # List of frequencies of structures other than words
    s, vp, c, t, dc, ct, cp, cn = patterncount[:8]

    # Compute the 14 syntactic complexity indices
    mls=division(w,s)
    mlt=division(w,t)
    mlc=division(w,c)
    c_s=division(c,s)
    vp_t=division(vp,t)
    c_t=division(c,t)
    dc_c=division(dc,c)
    dc_t=division(dc,t)
    t_s=division(t,s)
    ct_t=division(ct,t)
    cp_t=division(cp,t)
    cp_c=division(cp,c)
    cn_t=division(cn,t)
    cn_c=division(cn,c)

    if short_names:
        measures = {
            "W": w,
            "S": s,
            "VP": vp,
            "C": c,
            "T": t,
            "DC": dc,
            "CT": ct,
            "CP": cp,
            "CN": cn,
            "MLS": mls,
            "MLT": mlt,
            "MLC": mlc,
            "C/S": c_s,
            "VP/T": vp_t,
            "C/T": c_t,
            "DC/C": dc_c,
            "DC/T": dc_t,
            "T/S": t_s,
            "CT/T": ct_t,
            "CP/T": cp_t,
            "CP/C": cp_c,
            "CN/T": cn_t,
            "CN/C": cn_c
        }
    else:
        measures = {
            "words": w,
            "sentences": s,
            "verb phrases": vp,
            "clauses": c,
            "T-units": t,
            "dependent clauses": dc,
            "complex T-units": ct,
            "coordinate phrases": cp,
            "complex nominals": cn,
            "mean length of sentence (MLS)": mls,
            "mean length of T-unit (MLT)": mlt,
            "mean length of clause (MLC)": mlc,
            "clauses per sentence (C/S)": c_s,
            "verb phrases per T-unit (VP/T)": vp_t,
            "clauses per T-unit (C/T)": c_t,
            "dependent clauses per clause (DC/C)": dc_c,
            "dependent clauses per T-unit (DC/T)": dc_t,
            "T-units per sentence (T/S)": t_s,
            "complex T-unit ratio (CT/T)": ct_t,
            "coordinate phrases per T-unit (CP/T)": cp_t,
            "coordinate phrases per clause (CP/C)": cp_c,
            "complex nominals per T-unit (CN/T)": cn_t,
            "complex nominals per clause (CN/C)": cn_c
        }
    for key in measures:
        measures[key] = round(measures[key], 4)

    # Delete the temporary input file and the file holding the parse trees
    os.remove(inputFile)
    os.remove(parsedFile)

    return measures
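
For context, a minimal driver for this script would be something like the following (a sketch: the sample sentence is made up, and it assumes the Stanford Parser and Tregex shell scripts are in place at the paths the function expects):

if __name__ == "__main__":
    # Illustrative input; any plain-text English passage works
    sample = "The cat sat on the mat. It purred because it was happy."
    measures = analyze_text(sample)
    for name, value in measures.items():
        print(f"{name}: {value}")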

so that I can do this:

[screenshot: calling analyze_text and printing the measures dictionary]

i.e., how do I get the same measurements dictionary using neosca?

p-acharya commented 3 months ago

Nvm I got it! Just wrote a quick script:

from neosca.ns_sca.ns_sca import Ns_SCA

def run_sca_on_text(text):
    # Create an instance of Ns_SCA with is_stdout set to True to print the output
    analyzer = Ns_SCA(is_stdout=True, oformat_freq="json")

    # Run the SCA on the input text
    analyzer.run_on_text(text)

# Define the sample at module level so it is in scope for the call below
sample_1 = """
Scores of properties are under extreme fire threat as a huge blaze
continues to advance through Sydney's north-western suburbs. Fires
have also shut down the major road and rail links between Sydney and
Gosford.

The promotional stop in Sydney was everything to be expected for a
Hollywood blockbuster - phalanxes of photographers, a stretch limo to
a hotel across the Quay - but with one difference. A line-up of
masseurs was waiting to take the media in hand. Never has the term
'massaging the media' seemed so accurate.
"""

# Test the function with a sample text
run_sca_on_text(sample_1)
tanloong commented 3 months ago

That's great! Here is a different way, but whatever works as long as you get the dictionary.

from neosca.ns_sca.ns_sca import Ns_SCA, Ns_SCA_Counter

def analyze_text(text: str) -> dict[str, float]:
    sca.run_on_text(text)
    counter: Ns_SCA_Counter = sca.counters[0]
    name_value_map: dict[str, str] = counter.get_all_values(precision=4)
    name_value_map.pop("Filepath")
    ret: dict[str, float] = {k: float(v) for k, v in name_value_map.items()}
    return ret

def analyze_file(filepath: str) -> dict[str, float]:
    counter: Ns_SCA_Counter = sca.run_on_file_or_subfiles(filepath)
    name_value_map: dict[str, str] = counter.get_all_values(precision=4)
    name_value_map.pop("Filepath")
    ret: dict[str, float] = {k: float(v) for k, v in name_value_map.items()}
    return ret

# Shared Ns_SCA instance used by both helper functions above
sca = Ns_SCA()
sample_1 = """
    Scores of properties are under extreme fire threat as a huge
    blaze continues to advance through Sydney's north-western suburbs. Fires have
    also shut down the major road and rail links between Sydney and Gosford.

    The promotional stop in Sydney was everything to be expected for a Hollywood
    blockbuster - phalanxes of photographers, a stretch limo to a hotel across the
    Quay - but with one difference. A line-up of masseuses was waiting to take the
    media in hand. Never has the term "massaging the media" seemed so accurate.
"""
analyze_text(sample_1)
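
If you want to batch-process a whole directory, the same helper can be reused. A sketch, where the essays/*.txt glob pattern and the "C/T" / "MLT" key names are only illustrative:

from glob import glob

for filepath in glob("essays/*.txt"):
    measures = analyze_file(filepath)
    # Measure keys assumed to follow L2SCA's short labels, e.g. "C/T", "MLT"
    print(filepath, measures.get("C/T"), measures.get("MLT"))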

Here is the basic command line usage:

python -m neosca sca --text 'This is a test.'
python -m neosca sca filepath.txt

For descriptions of additional options, use:

python -m neosca --help
python -m neosca sca --help