Open delfimpandiani opened 5 years ago
import pandas as pd
import networkx as nx
import datetime as date
def process_citation_data(file_path):
    """Load the citation CSV at ``file_path`` into a DataFrame indexed by DOI.

    The first row of the file is consumed as a header and its labels are
    replaced by ``DOI``, ``citation_number`` and ``known_reference``; the
    ``DOI`` column then becomes the index.

    Args:
        file_path: Path (or any object ``pandas.read_csv`` accepts) of the CSV.

    Returns:
        A DataFrame with columns ``citation_number`` and ``known_reference``,
        indexed by ``DOI``.
    """
    column_labels = ['DOI', 'citation_number', 'known_reference']
    return pd.read_csv(
        file_path,
        header=0,
        names=column_labels,
        index_col=column_labels[0],
    )
def data_citation_graph(data):
    """Explode the ``known_reference`` column into one row per cited DOI.

    Each ``known_reference`` cell is split on ``;``. The result is a long
    table with one (citing DOI, cited DOI) pair per row, with surrounding
    whitespace stripped from both columns.

    Side effect: the result is also stored in the module-level global
    ``cit_data``, which the other functions in this module reuse.

    Args:
        data: DataFrame indexed by ``DOI`` with a string column
            ``known_reference`` holding ``;``-separated references.

    Returns:
        DataFrame with columns ``DOI``, ``level_1`` (position of the reference
        within its cell) and ``known_reference``.
    """
    global cit_data
    split_refs = data.known_reference.str.split(';', expand=True)
    # ``stack(dropna=True)`` is rejected by pandas' newer stack implementation.
    # A plain ``stack()`` followed by ``dropna()`` yields the same rows under
    # both the old and the new implementation.
    cit_data = (
        split_refs.stack()
        .dropna()
        .reset_index()
        .rename(columns={0: 'known_reference'})
    )
    cit_data['known_reference'] = cit_data['known_reference'].str.strip()
    cit_data['DOI'] = cit_data['DOI'].str.strip()
    # A trailing or doubled ';' produces an empty reference; keeping it would
    # create a spurious '' node shared by unrelated publications.
    cit_data = cit_data[cit_data['known_reference'] != ''].reset_index(drop=True)
    return cit_data
def do_citation_graph(data, sse):
    """Build the citation multigraph and a copy with human-readable labels.

    Side effects: sets the module-level globals ``citation_structure`` (nodes
    are raw DOIs) and ``citation_graph`` (same graph, relabelled). If the
    global ``cit_data`` does not exist yet it is created from ``data``;
    otherwise the existing ``cit_data`` is reused and ``data`` is not read.

    Args:
        data: Citation DataFrame, forwarded to ``data_citation_graph``.
        sse: Object exposing ``data`` (an iterable of records with a ``'doi'``
            key) and ``pretty_print(records)``, which returns one label per
            record.

    Returns:
        The relabelled ``networkx.MultiDiGraph``.
    """
    global citation_structure, citation_graph
    if 'cit_data' not in globals():
        data_citation_graph(data)

    citation_structure = nx.from_pandas_edgelist(
        cit_data,
        'DOI',
        'known_reference',
        create_using=nx.MultiDiGraph(),
    )

    # Phase 1: pick the record for every DOI that is a node of the graph
    # (a later record with the same DOI replaces an earlier one).
    records_by_doi = {}
    for record in sse.data:
        if record['doi'] in citation_structure:
            records_by_doi[record['doi']] = record

    # Phase 2: turn each selected record into its display label.
    labels = {
        doi: sse.pretty_print([record])[0]
        for doi, record in records_by_doi.items()
    }

    citation_graph = nx.relabel_nodes(citation_structure, labels, copy=True)
    return citation_graph
def do_coupling(data, sse, doi_1, doi_2, coupling=0):
    """Return the bibliographic coupling strength of two publications.

    The strength is the number of distinct references cited by both
    ``doi_1`` and ``doi_2`` in the global ``citation_structure`` graph, which
    is built via ``do_citation_graph`` if it does not exist yet.

    Args:
        data: Citation DataFrame, used only if the graph must be built.
        sse: Search-engine object, used only if the graph must be built.
        doi_1: DOI of the first publication.
        doi_2: DOI of the second publication.
        coupling: Value returned when either DOI is not a node of the graph.

    Returns:
        The number of shared references, or ``coupling`` if either DOI is
        unknown.
    """
    if 'citation_structure' not in globals():
        do_citation_graph(data, sse)
    # Both DOIs must be checked separately: ``(a and b) in g`` evaluates
    # ``a and b`` first and therefore only ever tests one of them.
    if doi_1 in citation_structure and doi_2 in citation_structure:
        cited_by_second = citation_structure[doi_2]
        coupling = sum(
            1 for reference in citation_structure[doi_1]
            if reference in cited_by_second
        )
    return coupling
def do_aut_coupling(data, sse, aut_1, aut_2):
    """Return the bibliographic coupling strength between two authors.

    References are collected from the publications each author wrote without
    the other one (co-authored publications are ignored). The result counts
    the entries of the first author's reference list, duplicates included,
    that also occur in the second author's reference list.

    Uses the global ``cit_data`` table, building it from ``data`` via
    ``data_citation_graph`` if it does not exist yet.

    Args:
        data: Citation DataFrame, used only if ``cit_data`` must be built.
        sse: Object whose ``data`` is an iterable of records with ``'doi'``
            and ``'authors'`` keys; ``'authors'`` is a ``'; '``-separated
            string.
        aut_1: Name of the first author.
        aut_2: Name of the second author.

    Returns:
        The coupling count as an ``int``.
    """
    if 'cit_data' not in globals():
        data_citation_graph(data)

    # Index the references by citing DOI once, instead of filtering the whole
    # table again for every publication.
    refs_by_doi = {}
    for doi, reference in zip(cit_data['DOI'], cit_data['known_reference']):
        refs_by_doi.setdefault(doi, []).append(reference)

    aut_1_cit, aut_2_cit = [], []
    for row in sse.data:
        authors = row['authors'].split('; ')
        wrote_1 = aut_1 in authors
        wrote_2 = aut_2 in authors
        if wrote_1 and not wrote_2:
            aut_1_cit.extend(refs_by_doi.get(row['doi'], ()))
        if wrote_2 and not wrote_1:
            aut_2_cit.extend(refs_by_doi.get(row['doi'], ()))

    # Set membership keeps the final count linear; duplicates on the first
    # author's side are still counted individually.
    cited_by_second = set(aut_2_cit)
    return sum(1 for reference in aut_1_cit if reference in cited_by_second)
def do_aut_distance(data, sse, aut):
    """Build the co-authorship graph reachable from ``aut`` with hop distances.

    Starting from ``aut``, the co-author network of every newly discovered
    author is merged into one ``networkx.MultiGraph``. Each node then receives
    a ``"Distance"`` attribute holding its shortest-path length from ``aut``.

    Args:
        data: Not used by this function.
        sse: Object exposing ``coauthor_network(author)``, whose return value
            is passed to ``networkx.MultiGraph``.
        aut: Name of the author to start from.

    Returns:
        The merged ``networkx.MultiGraph`` with ``"Distance"`` node attributes.
    """
    # Authors already expanded. Held in the closure rather than in a mutable
    # default argument, and as a set so membership checks are O(1).
    visited = set()

    def expand(author, graph):
        """Merge ``author``'s co-author network into ``graph``, recursively."""
        visited.add(author)
        graph = nx.compose(graph, nx.MultiGraph(sse.coauthor_network(author)))
        # ``nx.compose`` returns a new graph, so rebinding ``graph`` inside
        # the loop does not disturb the iteration over the current node set.
        for node in graph:
            if node not in visited:
                graph = expand(node, graph)
        return graph

    aut_distance = expand(aut, nx.MultiGraph())
    distance = nx.single_source_shortest_path_length(aut_distance, aut)
    nx.set_node_attributes(aut_distance, distance, "Distance")
    return aut_distance
def do_find_cycles(data, sse):
    """Return every simple cycle of the relabelled citation graph.

    Reads the global ``citation_graph``, building it via
    ``do_citation_graph`` if it does not exist yet.

    Args:
        data: Citation DataFrame, used only if the graph must be built.
        sse: Search-engine object, used only if the graph must be built.

    Returns:
        A list of tuples, one per cycle reported by ``networkx.simple_cycles``.
    """
    graph_is_ready = 'citation_graph' in globals()
    if not graph_is_ready:
        do_citation_graph(data, sse)
    return list(map(tuple, nx.simple_cycles(citation_graph)))
def do_cit_count_year(data, sse, aut, year):
    """Sum an author's citation counts per publication year.

    Args:
        data: DataFrame indexed by DOI with a ``citation_number`` column.
        sse: Object whose ``data`` is an iterable of records with ``'doi'``,
            ``'year'`` and ``'authors'`` keys.
        aut: Author to look for; matched with ``aut in record['authors']``.
        year: First year to report, or ``None`` to start from the earliest
            publication year found in ``sse.data``.

    Returns:
        A dict mapping year (as a string) to the summed ``citation_number``
        of the author's publications from that year. Every year from the
        start year up to the current year is present, as is the year of any
        publication in ``sse.data`` at or after the start year.
    """
    current_year = date.datetime.today().year

    if year is None:
        # Compare years numerically, not as strings; with no publications at
        # all, fall back to the current year instead of raising.
        first_year = min(
            (int(book['year']) for book in sse.data), default=current_year
        )
        cit_count_year = {
            str(item): 0 for item in range(first_year, current_year + 1)
        }
    elif year <= current_year:
        # ``<=`` so that asking for the current year itself is handled too.
        cit_count_year = {str(item): 0 for item in range(year, current_year + 1)}
    else:
        cit_count_year = {str(year): 0}

    doi_year = {}
    for book in sse.data:
        if year is not None and int(book['year']) < year:
            continue
        # Normalise to ``str`` so keys agree with the pre-filled buckets.
        year_key = str(book['year'])
        cit_count_year[year_key] = 0
        # NOTE(review): when 'authors' is a '; '-joined string this is a
        # substring match, so a name that is a prefix of another also matches.
        if aut in book['authors']:
            doi_year[book['doi']] = year_key

    for doi, row in data.iterrows():
        if doi in doi_year:
            cit_count_year[doi_year[doi]] += row['citation_number']
    return cit_count_year
Final code, almost ready to be turned in. Final issues to resolve: