Status: Open. fnielsen opened this issue 6 years ago.
Attempt:
from ast import *
import importlib
import os.path
from pprint import pprint
import re
# Parse one hard-coded gensim module as a first experiment.  The first
# assignment is overridden immediately, so only word2vec.py is parsed.
filename = "/usr/local/lib/python2.7/dist-packages/gensim/__init__.py"
filename = "/usr/local/lib/python2.7/dist-packages/gensim/models/word2vec.py"
# A context manager closes the file handle deterministically instead of
# leaving it open until the garbage collector reclaims it.
with open(filename) as source_file:
    parsed = parse(source_file.read())
# Collect the citations found in every ``.py`` file of an installed package.
# NOTE: ``filename_to_citations`` must already be defined when this runs.
package = 'sklearn'
citations = []
# The package is imported only to locate its installation directory.
directory = os.path.dirname(importlib.import_module(package).__file__)
# os.walk yields (dirpath, dirnames, filenames) and recurses on its own, so
# the list of sub-directory names is not needed here.
for root, _dirnames, filenames in os.walk(directory):
    for filename in filenames:
        if not filename.endswith('.py'):
            continue
        full_filename = os.path.join(root, filename)
        print(full_filename)  # progress output: one line per scanned file
        citations.extend(filename_to_citations(full_filename))
# Sample reference section used to try out the citation extraction by hand.
STRING = """
.. [1] Tomas Mikolov, Kai Chen, Greg Corrado, and Jeffrey Dean.
Efficient Estimation of Word Representations in Vector Space. In Proceedings of Workshop at ICLR, 2013.
.. [2] Tomas Mikolov, Ilya Sutskever, Kai Chen, Greg Corrado, and Jeffrey Dean.
"""
# Matches one reST-style reference entry such as ``.. [1] Author. Title.`` at
# the start of a line and captures the text after the bracketed label.  An
# entry may continue over following lines as long as each break consists of
# 7-20 whitespace characters (the line break itself included).
PATTERN = re.compile(r'^\.\. \[[^\]]+\] (.+?(?:$\s{7,20}.+?)*)$', flags=re.DOTALL | re.MULTILINE)


def docstring_to_citations(docstring):
    """Extract the citation texts from a docstring.

    Parameters
    ----------
    docstring : str or None
        Docstring text to scan for ``.. [label] text`` entries.

    Returns
    -------
    list of str
        One string per matched entry, in order of appearance, with every
        run of whitespace collapsed to a single space and surrounding
        whitespace removed.  Empty when `docstring` is empty or None.

    """
    # Objects without a docstring yield None; treat that as "no citations".
    if not docstring:
        return []
    # split()/join collapses line breaks and indentation of multi-line
    # entries and also drops stray leading/trailing whitespace.
    return [' '.join(text.split()) for text in PATTERN.findall(docstring)]
# Try the extractor on the sample text.  The return value is not stored, so
# it is only displayed when this runs in an interactive session.
docstring_to_citations(STRING)
def filename_to_citations(filename):
    """Collect the citations from all docstrings in a Python source file.

    Parameters
    ----------
    filename : str
        Path of the Python source file to parse.

    Returns
    -------
    list of str
        Citations found in the docstrings of the nodes of the parsed
        file, as returned by `docstring_to_citations`.

    Raises
    ------
    SyntaxError
        If the file content cannot be parsed as Python source.

    """
    # Read bytes so the parser honours the file's own encoding declaration,
    # and close the handle as soon as the content has been read.
    with open(filename, 'rb') as source_file:
        parsed = parse(source_file.read(), filename)

    citations = []
    for node in walk(parsed):
        try:
            docstring = get_docstring(node)
        except TypeError:
            # get_docstring rejects node types that cannot carry a
            # docstring; those nodes are simply skipped.
            continue
        if docstring:
            citations.extend(docstring_to_citations(docstring))
    return citations
# Pretty-print the module-level ``citations`` list.
pprint(citations)
Some Python code may contain citations to scientific articles in its docstrings. With Python's introspection capabilities, it should be possible to find and extract these citations.