SeverinJB / the_lads

Project "Scholarly Network Engine" - Examination for "Computational Thinking and Programming" - Second-cycle degree "Digital Humanities and Digital Knowledge" at the University of Bologna
0 stars 0 forks source link

the_lads.py #8

Open delfimpandiani opened 5 years ago

delfimpandiani commented 5 years ago

Final code, almost ready to be turned in. Final issues to resolve:

  1. Is `do_aut_distance` here correct, and is it the final version to be turned in? Done! ✔️
  2. Were we supposed to do test-driven development, i.e., should we provide tests?
import pandas as pd
import networkx as nx
import datetime as date

def process_citation_data(file_path):
    """Load the citation CSV and cache one row per (DOI, cited reference).

    The file's own header row is discarded and the three columns are named
    ``DOI`` (used as index), ``citation_number`` and ``known_reference``.

    Side effect: the module-level ``ref_data`` DataFrame is (re)built with the
    columns ``DOI`` and ``known_reference``; it holds one row for every
    non-empty entry of the ``;``-separated ``known_reference`` field, with
    surrounding whitespace removed from both columns.

    :param file_path: path (or file-like object) accepted by ``pd.read_csv``.
    :return: the loaded DataFrame, indexed by DOI.
    """
    global ref_data
    df = pd.read_csv(file_path, index_col='DOI', header=0,
                     names=['DOI', 'citation_number', 'known_reference'])

    # split + explode avoids DataFrame.stack(dropna=...), which newer pandas
    # versions deprecate. Dropping NaN first and casting to str keeps the
    # .str accessor usable even when no paper lists any reference.
    references = (
        df['known_reference']
        .dropna()
        .astype(str)
        .str.split(';')
        .explode()
        .str.strip()
    )
    # A trailing or doubled ';' yields empty fragments; they are not DOIs.
    references = references[references != '']

    ref_data = references.reset_index()
    ref_data['DOI'] = ref_data['DOI'].str.strip()

    return df

def do_citation_graph(data, sse):
    """Build the citation graph and cache it in module globals.

    Reads the module-level ``ref_data`` table (columns ``DOI`` and
    ``known_reference``) and turns every row into a directed edge
    ``DOI -> known_reference``.

    Side effects: sets the globals ``citation_structure`` (nodes are raw
    DOIs) and ``citation_graph`` (same graph with relabelled nodes).

    :param data: unused here; kept for a uniform function signature.
    :param sse: object exposing ``data`` (iterable of dicts with a ``'doi'``
        key) and ``pretty_print`` (called with a one-element list; its first
        result is used as the node label -- presumably a display string).
    :return: the relabelled ``citation_graph``.
    """
    global citation_structure, citation_graph

    citation_structure = nx.from_pandas_edgelist(
        ref_data,
        source='DOI',
        target='known_reference',
        create_using=nx.MultiDiGraph(),
    )

    # Keep only the records whose DOI actually occurs in the graph.
    records_by_doi = {}
    for record in sse.data:
        if record['doi'] in citation_structure:
            records_by_doi[record['doi']] = record

    # DOIs without a matching record keep their raw DOI as node name.
    labels = {
        doi: sse.pretty_print([record])[0]
        for doi, record in records_by_doi.items()
    }

    citation_graph = nx.relabel_nodes(citation_structure, labels, copy=True)
    return citation_graph

def do_coupling(data, sse, doi_1, doi_2, coupling=0):
    """Count the references shared by two papers.

    Uses the module-level ``citation_structure`` graph, building it through
    ``do_citation_graph`` first when it does not exist yet.

    :param data: citation table, only forwarded to ``do_citation_graph``.
    :param sse: search-engine object, only forwarded to ``do_citation_graph``.
    :param doi_1: DOI of the first paper.
    :param doi_2: DOI of the second paper.
    :param coupling: value returned when either DOI is not in the graph.
    :return: number of graph neighbours of ``doi_1`` that are also
        neighbours of ``doi_2``.
    """
    if 'citation_structure' not in globals():
        do_citation_graph(data, sse)

    graph = citation_structure
    if doi_1 not in graph or doi_2 not in graph:
        return coupling

    cited_by_second = graph[doi_2]
    return sum(1 for cited in graph[doi_1] if cited in cited_by_second)

def do_aut_coupling(data, sse, aut_1, aut_2):
    """Count references of ``aut_1``'s papers that ``aut_2``'s papers also cite.

    Papers listing both authors are ignored.  References are looked up in the
    module-level ``ref_data`` table (columns ``DOI`` / ``known_reference``).
    Every reference collected for ``aut_1`` is counted separately, so a
    reference cited by two of their papers counts twice.

    :param data: unused here; kept for a uniform function signature.
    :param sse: object whose ``data`` yields dicts with ``'authors'`` (names
        joined by ``'; '``) and ``'doi'``.
    :param aut_1: exact name of the first author.
    :param aut_2: exact name of the second author.
    :return: the coupling count as an int.
    """
    refs_first, refs_second = [], []

    for publication in sse.data:
        names = publication['authors'].split('; ')
        by_first = aut_1 in names
        by_second = aut_2 in names
        if by_first == by_second:
            # Written by neither author or co-written by both: skip.
            continue

        cited = ref_data.loc[ref_data['DOI'] == publication['doi'], 'known_reference']
        bucket = refs_first if by_first else refs_second
        bucket.extend(cited)

    return sum(1 for reference in refs_first if reference in refs_second)

def do_aut_distance(data, sse, aut):
    """Build the co-author network reachable from ``aut`` with distances.

    Starting from ``aut``, the graph returned by ``sse.coauthor_network`` is
    merged in for every author discovered so far; newly seen neighbours are
    appended to the work list until no new author turns up.  Each node of the
    result gets a ``"Distance"`` attribute holding its shortest-path length
    from ``aut``.

    :param data: unused here; kept for a uniform function signature.
    :param sse: object exposing ``coauthor_network(author)``; its result is
        passed to ``nx.MultiGraph`` (presumably a graph linking the author to
        their co-authors -- confirm against the sse implementation).
    :param aut: name of the author to start from.
    :return: the merged ``nx.MultiGraph``.
    """
    network = nx.MultiGraph()
    pending = [aut]
    position = 0

    # Index-based walk: ``pending`` grows while it is being processed.
    while position < len(pending):
        current = pending[position]
        position += 1

        network = nx.compose(network, nx.MultiGraph(sse.coauthor_network(current)))
        for neighbour in network.neighbors(current):
            if neighbour not in pending:
                pending.append(neighbour)

    hops = nx.single_source_shortest_path_length(network, aut)
    nx.set_node_attributes(network, hops, "Distance")

    return network

def do_find_cycles(data, sse):
    """Return every simple cycle of the citation graph as a tuple of nodes.

    Works on the module-level ``citation_graph``; when that global does not
    exist yet it is created through ``do_citation_graph(data, sse)``.

    :param data: citation table, only forwarded to ``do_citation_graph``.
    :param sse: search-engine object, only forwarded to ``do_citation_graph``.
    :return: list of tuples, one per cycle reported by ``nx.simple_cycles``.
    """
    if 'citation_graph' not in globals():
        do_citation_graph(data, sse)

    return [tuple(cycle) for cycle in nx.simple_cycles(citation_graph)]

def do_cit_count_year(data, sse, aut, year):
    """Sum an author's citation counts per publication year.

    The result maps every year from the start year up to the latest of
    (current year, the author's newest paper, ``year``) to the total
    ``citation_number`` of the author's papers published in that year.

    :param data: DataFrame indexed by DOI with a ``citation_number`` column.
    :param sse: object whose ``data`` yields dicts with ``'authors'`` (names
        joined by ``'; '``, as in ``do_aut_coupling``), ``'doi'`` and
        ``'year'``.
    :param aut: exact author name.
    :param year: first year to report, or ``None`` to start at the author's
        earliest paper (the current year if the author has no papers).
    :return: dict ``{year (int): citation total}``.
    """
    this_year = date.datetime.today().year

    # Match whole names: a substring test on the raw authors string would
    # also accept e.g. "Li, Wei" inside "Li, Weiming".
    aut_papers = [
        (paper['doi'], int(paper['year']))
        for paper in sse.data
        if aut in paper['authors'].split('; ')
    ]
    years_aut = [pub_year for _, pub_year in aut_papers]

    # An author without papers must not crash min()/max().
    if year is None:
        first_year = min(years_aut, default=this_year)
    else:
        first_year = year
    last_year = max(years_aut + [this_year, first_year])

    cit_dict = {item: 0 for item in range(first_year, last_year + 1)}
    doi_dict = {doi: pub_year for doi, pub_year in aut_papers
                if pub_year >= first_year}

    for doi, citations in data['citation_number'].items():
        if doi in doi_dict:
            cit_dict[doi_dict[doi]] += citations

    return cit_dict
SeverinJB commented 5 years ago
import pandas as pd
import networkx as nx
import datetime as date

def process_citation_data(file_path):
    """Load the citation CSV into a DataFrame indexed by DOI.

    The file's own header row is discarded; the columns are named ``DOI``
    (index), ``citation_number`` and ``known_reference``.

    :param file_path: path (or file-like object) accepted by ``pd.read_csv``.
    :return: the loaded DataFrame.
    """
    column_names = ['DOI', 'citation_number', 'known_reference']
    return pd.read_csv(file_path, index_col='DOI', header=0, names=column_names)

def data_citation_graph(data):
    """Explode the citation table into one row per (DOI, cited reference).

    Side effect: the result is also stored in the module-level ``cit_data``
    so that later calls can reuse it.

    :param data: DataFrame indexed by ``DOI`` with a ``known_reference``
        column holding ``;``-separated references.
    :return: DataFrame with the columns ``DOI`` and ``known_reference``;
        both are stripped of surrounding whitespace, and papers without
        references as well as empty fragments are left out.
    """
    global cit_data

    # split + explode avoids DataFrame.stack(dropna=...), which newer pandas
    # versions deprecate. Dropping NaN first and casting to str keeps the
    # .str accessor usable even when no paper lists any reference.
    references = (
        data['known_reference']
        .dropna()
        .astype(str)
        .str.split(';')
        .explode()
        .str.strip()
    )
    # A trailing or doubled ';' yields empty fragments; they are not DOIs.
    references = references[references != '']

    cit_data = references.reset_index()
    cit_data['DOI'] = cit_data['DOI'].str.strip()

    return cit_data

def do_citation_graph(data, sse):
    """Build the citation graph and cache it in module globals.

    The edge table ``cit_data`` is created through ``data_citation_graph``
    when it is not cached yet; every row becomes a directed edge
    ``DOI -> known_reference``.

    Side effects: sets the globals ``citation_structure`` (nodes are raw
    DOIs) and ``citation_graph`` (same graph with relabelled nodes).

    :param data: citation table, forwarded to ``data_citation_graph``.
    :param sse: object exposing ``data`` (iterable of dicts with a ``'doi'``
        key) and ``pretty_print`` (called with a one-element list; its first
        result is used as the node label -- presumably a display string).
    :return: the relabelled ``citation_graph``.
    """
    global citation_structure, citation_graph

    if 'cit_data' not in globals():
        data_citation_graph(data)

    citation_structure = nx.from_pandas_edgelist(
        cit_data,
        source='DOI',
        target='known_reference',
        create_using=nx.MultiDiGraph(),
    )

    # Keep only the records whose DOI actually occurs in the graph.
    records_by_doi = {}
    for record in sse.data:
        if record['doi'] in citation_structure:
            records_by_doi[record['doi']] = record

    # DOIs without a matching record keep their raw DOI as node name.
    labels = {
        doi: sse.pretty_print([record])[0]
        for doi, record in records_by_doi.items()
    }

    citation_graph = nx.relabel_nodes(citation_structure, labels, copy=True)
    return citation_graph

def do_coupling(data, sse, doi_1, doi_2, coupling=0):
    """Count the references shared by two papers.

    Uses the module-level ``citation_structure`` graph, building it through
    ``do_citation_graph`` first when it does not exist yet.

    :param data: citation table, only forwarded to ``do_citation_graph``.
    :param sse: search-engine object, only forwarded to ``do_citation_graph``.
    :param doi_1: DOI of the first paper.
    :param doi_2: DOI of the second paper.
    :param coupling: value returned when either DOI is not in the graph.
    :return: number of graph neighbours of ``doi_1`` that are also
        neighbours of ``doi_2``.
    """
    if 'citation_structure' not in globals():
        do_citation_graph(data, sse)

    # Each DOI needs its own membership test: ``(doi_1 and doi_2) in graph``
    # evaluates to ``doi_2 in graph`` and lets an unknown doi_1 raise KeyError.
    if doi_1 in citation_structure and doi_2 in citation_structure:
        cited_by_second = citation_structure[doi_2]
        coupling = sum(1 for cited in citation_structure[doi_1]
                       if cited in cited_by_second)

    return coupling

def do_aut_coupling(data, sse, aut_1, aut_2):
    """Count references of ``aut_1``'s papers that ``aut_2``'s papers also cite.

    Papers listing both authors are ignored.  References come from the
    ``cit_data`` table, which is built through ``data_citation_graph`` when
    it is not cached yet.  Every reference collected for ``aut_1`` is counted
    separately, so a reference cited by two of their papers counts twice.

    :param data: citation table, forwarded to ``data_citation_graph``.
    :param sse: object whose ``data`` yields dicts with ``'authors'`` (names
        joined by ``'; '``) and ``'doi'``.
    :param aut_1: exact name of the first author.
    :param aut_2: exact name of the second author.
    :return: the coupling count as an int.
    """
    if 'cit_data' not in globals():
        data_citation_graph(data)

    refs_first, refs_second = [], []

    for publication in sse.data:
        names = publication['authors'].split('; ')
        by_first = aut_1 in names
        by_second = aut_2 in names
        if by_first == by_second:
            # Written by neither author or co-written by both: skip.
            continue

        cited = cit_data.loc[cit_data['DOI'] == publication['doi'], 'known_reference']
        bucket = refs_first if by_first else refs_second
        bucket.extend(cited)

    return sum(1 for reference in refs_first if reference in refs_second)

def do_aut_distance(data, sse, aut):
    """Build the co-author network reachable from ``aut`` with distances.

    Starting from ``aut``, the graph returned by ``sse.coauthor_network`` is
    merged in for every author that appears in the network so far, until no
    unseen author is left.  Each node of the result gets a ``"Distance"``
    attribute holding its shortest-path length from ``aut``.

    The traversal is iterative: a recursive walk goes one call deeper per
    newly found author and can hit Python's recursion limit on large
    networks.

    :param data: unused here; kept for a uniform function signature.
    :param sse: object exposing ``coauthor_network(author)``; its result is
        passed to ``nx.MultiGraph`` (presumably a graph linking the author to
        their co-authors -- confirm against the sse implementation).
    :param aut: name of the author to start from.
    :return: the merged ``nx.MultiGraph``.
    """
    aut_distance = nx.MultiGraph()
    discovered = {aut}
    pending = [aut]

    while pending:
        current = pending.pop()
        ego_network = nx.MultiGraph(sse.coauthor_network(current))
        aut_distance = nx.compose(aut_distance, ego_network)

        # Only this author's network can contain authors not seen before.
        for node in ego_network:
            if node not in discovered:
                discovered.add(node)
                pending.append(node)

    distance = nx.single_source_shortest_path_length(aut_distance, aut)
    nx.set_node_attributes(aut_distance, distance, "Distance")

    return aut_distance

def do_find_cycles(data, sse):
    """List the simple cycles of the citation graph.

    The module-level ``citation_graph`` is used; if it has not been created
    yet, ``do_citation_graph(data, sse)`` is called to build it first.

    :param data: citation table, only forwarded to ``do_citation_graph``.
    :param sse: search-engine object, only forwarded to ``do_citation_graph``.
    :return: list with one tuple of nodes per cycle found by
        ``nx.simple_cycles``.
    """
    graph_missing = 'citation_graph' not in globals()
    if graph_missing:
        do_citation_graph(data, sse)

    return list(map(tuple, nx.simple_cycles(citation_graph)))

def do_cit_count_year(data, sse, aut, year):
    """Sum an author's citation counts per publication year.

    The result has one key (the year as a string) for every year from the
    start year up to the current year, plus the year of every record in
    ``sse.data`` that is not older than ``year``.  Each value is the total
    ``citation_number`` of the author's papers published in that year.

    :param data: DataFrame indexed by DOI with a ``citation_number`` column.
    :param sse: object whose ``data`` yields dicts with ``'authors'`` (names
        joined by ``'; '``, as in ``do_aut_coupling``), ``'doi'`` and
        ``'year'``.
    :param aut: exact author name.
    :param year: first year to report, or ``None`` to start at the earliest
        year in ``sse.data`` (the current year if there are no records).
    :return: dict ``{year (str): citation total}``.
    """
    this_year = date.datetime.today().year

    if year is None:
        # Compare years numerically and tolerate an empty record list.
        first_year = min((int(book['year']) for book in sse.data),
                         default=this_year)
    else:
        first_year = year

    # One range covers past, current and future start years alike, so the
    # dict is always defined (including when ``year`` is the current year).
    last_year = max(this_year, first_year)
    cit_count_year = {str(item): 0 for item in range(first_year, last_year + 1)}

    doi_year = {}
    for item in sse.data:
        if year is not None and int(item['year']) < year:
            continue
        year_key = str(item['year'])
        cit_count_year.setdefault(year_key, 0)
        # Match whole names: a substring test on the raw authors string
        # would also accept e.g. "Li, Wei" inside "Li, Weiming".
        if aut in item['authors'].split('; '):
            doi_year[item['doi']] = year_key

    for doi, citations in data['citation_number'].items():
        if doi in doi_year:
            cit_count_year[doi_year[doi]] += citations

    return cit_count_year