File ~\anaconda3\lib_strptime.py:568, in _strptime_datetime(cls, data_string, format)
565 def _strptime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
566 """Return a class cls instance based on the input string and the
567 format string."""
--> 568 tt, fraction, gmtoff_fraction = _strptime(data_string, format)
569 tzname, gmtoff = tt[-2:]
570 args = tt[:6] + (fraction,)
File ~\anaconda3\lib_strptime.py:349, in _strptime(data_string, format)
347 found = format_regex.match(data_string)
348 if not found:
--> 349 raise ValueError("time data %r does not match format %r" %
350 (data_string, format))
351 if len(data_string) != found.end():
352 raise ValueError("unconverted data remains: %s" %
353 data_string[found.end():])
ValueError: time data 'Financial Times' does not match format '%d %B %Y'
This is the full code that I use for extracting news from a set of files in a folder (which includes the news file attached):
import news_extract as ne
import os, glob
import pandas
directory = '.../FactivaDownloads/'
filenames_fc = []
for filename in glob.glob(os.path.join(directory, "*.txt")):
print(filename)
with open(os.path.join(os.getcwd(), filename), 'r') as f:
filenames_fc += [filename]
Hi,
I get the following error when I use news_extract to read the attached file, 4629_10.txt:
ValueError Traceback (most recent call last) Cell In[32], line 21 19 print(n+1) 20 fc_file = filenames_fc[n] #file exported from Factiva ---> 21 fc_data += ne.factiva_extract(fc_file) 23 data=ne.fix_fac_fieldnames(fc_data) 24 dataframe=ne.news_export(data,to_pandas=True, master_fields = [], jacc_threshold=1.1)
File ~\anaconda3\lib\site-packages\news_extract\news_extract.py:188, in factiva_extract(article_fn) 186 article_dict.update(article_dict3) 187 article_dict['FILENAME'] = article_fn --> 188 article_dict['PD'] = datetime.strptime( 189 article_dict['PD'], 190 '%d %B %Y').isoformat()[:10] 191 article_dict['WC'] = int(article_dict['WC'].replace( 192 " words","")) 193 factiva_list.append(article_dict)
File ~\anaconda3\lib_strptime.py:568, in _strptime_datetime(cls, data_string, format) 565 def _strptime_datetime(cls, data_string, format="%a %b %d %H:%M:%S %Y"): 566 """Return a class cls instance based on the input string and the 567 format string.""" --> 568 tt, fraction, gmtoff_fraction = _strptime(data_string, format) 569 tzname, gmtoff = tt[-2:] 570 args = tt[:6] + (fraction,)
File ~\anaconda3\lib_strptime.py:349, in _strptime(data_string, format) 347 found = format_regex.match(data_string) 348 if not found: --> 349 raise ValueError("time data %r does not match format %r" % 350 (data_string, format)) 351 if len(data_string) != found.end(): 352 raise ValueError("unconverted data remains: %s" % 353 data_string[found.end():])
ValueError: time data 'Financial Times' does not match format '%d %B %Y'
This is the full code that I use for extracting news from a set of files in a folder (which includes the news file attached):
import news_extract as ne
import os, glob
import pandas
directory = '.../FactivaDownloads/'
filenames_fc = []
for filename in glob.glob(os.path.join(directory, "*.txt")):
    print(filename)
    with open(os.path.join(os.getcwd(), filename), 'r') as f:
        filenames_fc += [filename]
fc_data = []
for n in range(0, len(filenames_fc)):
for n in range(0,2):
    print(n+1)
    fc_file = filenames_fc[n] #file exported from Factiva
    fc_data += ne.factiva_extract(fc_file)
data=ne.fix_fac_fieldnames(fc_data)
dataframe=ne.news_export(data,to_pandas=True, master_fields = [], jacc_threshold=1.1)
dataframe.to_excel(directory + "News.xlsx")
I would appreciate your assistance in solving this issue. Thank you, Best, Birgul