WDscholia / scholia

Wikidata-based scholarly profiles
https://scholia.toolforge.org
Other
223 stars 81 forks source link

Find citations in Python code #242

Open fnielsen opened 6 years ago

fnielsen commented 6 years ago

Some Python code may contain citations to scientific articles in the docstrings. With the introspection capabilities in Python it should be possible to find and extract these citations.

fnielsen commented 6 years ago

Attempt:

from ast import *
import importlib
import os.path
from pprint import pprint
import re

# Parse one hard-coded gensim module into an AST as a first experiment.
# (gensim/__init__.py from the same installation is another candidate input.)
filename = "/usr/local/lib/python2.7/dist-packages/gensim/models/word2vec.py"
# Read the raw bytes inside a context manager: the handle is closed as soon
# as the source has been read, and the parser decodes the bytes itself.
with open(filename, 'rb') as source_file:
    parsed = parse(source_file.read())

# Collect citation strings from every .py file of an installed package.
# NOTE: filename_to_citations is defined further down in this file, so when
# the file is executed top to bottom that definition has to run before this
# block does.
package = 'sklearn'

citations = []
# Directory that contains the imported package's own source file.
directory = os.path.dirname(importlib.import_module(package).__file__)
# os.walk yields (dirpath, dirnames, filenames); the subdirectory names are
# not needed here because the walk itself descends into them.
for root, _dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if not filename.endswith('.py'):
            continue
        full_filename = os.path.join(root, filename)
        print(full_filename)  # progress output, one line per scanned file
        citations.extend(filename_to_citations(full_filename))

# Sample text used to try out the citation pattern: two ".. [n]" entries,
# the first continued on an indented second line, the second consisting of
# a single line only.
STRING = """
.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean.
       Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.
.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
"""

# One citation entry: a line starting with ".. [label] " followed by the
# citation text, plus any number of continuation lines that begin with 7 to
# 20 whitespace characters (counting the preceding line break). The single
# capture group holds the citation text without the ".. [label] " prefix.
PATTERN = re.compile(
    r'^\.\. \[[^\]]+\] (.+?(?:$\s{7,20}.+?)*)$',
    flags=re.MULTILINE | re.DOTALL)


def docstring_to_citations(docstring):
    """Extract citation texts from a docstring.

    Parameters
    ----------
    docstring : str
        Text that may contain ``.. [label] ...`` citation entries.

    Returns
    -------
    citations : list of str
        One string per entry found, in order of appearance, with every run
        of whitespace (including line breaks) collapsed to a single space.

    """
    citations = []
    for match in PATTERN.finditer(docstring):
        raw_text = match.group(1)
        # Join multi-line entries into one line.
        citations.append(re.sub(r'\s+', ' ', raw_text, flags=re.DOTALL))
    return citations

# Run the extractor on the sample text; the return value is neither stored
# nor printed here.
docstring_to_citations(STRING)

def filename_to_citations(filename):
    """Extract citations from all docstrings in a Python source file.

    The file is parsed into an AST and every node is visited; the docstring
    of each node that has one is passed to ``docstring_to_citations``.

    Parameters
    ----------
    filename : str
        Path to a Python source file.

    Returns
    -------
    citations : list of str
        Citation strings in the order the nodes are visited by ``walk``.

    Raises
    ------
    IOError, OSError
        If the file cannot be read.
    SyntaxError
        If the file cannot be parsed by the running interpreter.

    """
    # Read bytes rather than text so that the parser itself honours the
    # file's encoding declaration instead of the locale's default encoding.
    # The context manager closes the handle as soon as the source is read.
    with open(filename, 'rb') as source_file:
        parsed = parse(source_file.read())

    citations = []
    for node in walk(parsed):
        try:
            docstring = get_docstring(node)
        except TypeError:
            # get_docstring raises TypeError for node types that cannot
            # carry a docstring; those nodes are simply skipped.
            continue
        if docstring:
            citations.extend(docstring_to_citations(docstring))
    return citations

# Pretty-print the module-level list of collected citation strings.
pprint(citations)