Open will3216 opened 9 years ago
+1 for pub-date & author extraction not being reliable
+1 for pub-date and author extraction not being reliable
https://github.com/Webhose/article-date-extractor — incorporating this new library would go a long way. I'm not a very good programmer, but if I can get it working I will try to submit a pull request.
Sooo, I have access to a bunch of news article pages, and after looking at them I made some improvements to the selectivity and the expansiveness of the workflow. However, I did this in JS (not my native language) and don't really feel like porting it to Python (also not my native language), but I did want to share with you guys what I have come up with. I don't imagine it will be too hard to port the changes:
// `var` is kept on purpose: in a classic browser script it keeps `utils`
// reachable as a global, which a `const` would not.
var utils = {
  /**
   * Determine the article's publication date.
   *
   * Strategies are tried in this order, and the next one is attempted only
   * when the previous one yields nothing usable:
   *   1. Publication date from metadata (<meta> tags)
   *   2. Publication date from the page's og:url
   *   3. Raw regex searches in the HTML + added heuristics (NOT IMPLEMENTED)
   *
   * Relies on `this.getOgUrl()` being provided by the host object.
   *
   * @returns {string} Whatever `dateParser._formatDateString` produces for
   *   the first strategy that succeeds, or '' when none does.
   */
  getPublicationDate: function () {
    // 1. Metadata. Guard against null/undefined as well as '' so that a
    //    matching tag with no value cannot crash the formatter.
    const rawMetaDate = dateParser._scrapePublicationDateFromMetaTags();
    if (typeof rawMetaDate === 'string' && rawMetaDate !== '') {
      const formattedMetaDate = dateParser._formatDateString(rawMetaDate);
      if (formattedMetaDate !== '') {
        return formattedMetaDate;
      }
      // The tag held something the formatter does not understand:
      // fall through to the URL strategy instead of giving up.
    }

    // 2. URL. A page without an og:url simply has no URL-derived date.
    const ogUrl = this.getOgUrl();
    if (typeof ogUrl !== 'string' || ogUrl === '') {
      return '';
    }
    const rawUrlDate = dateParser._parseDateFromUrl(ogUrl);
    if (typeof rawUrlDate !== 'string' || rawUrlDate === '') {
      return '';
    }
    return dateParser._formatDateString(rawUrlDate);
  },
};
/**
 * Helpers that locate an article's publication date (in <meta> tags or in
 * the URL) and normalise it to `YYYY-MM-DDTHH:MM:SS+HH:MM`.
 *
 * `var` is kept on purpose: in a classic browser script it keeps
 * `dateParser` reachable as a global, which a `const` would not.
 */
var dateParser = {
  /**
   * Descriptors of the tags that may carry a publication date, in priority
   * order. `attribute`/`value` identify the tag; `content` names the
   * attribute that holds the date.
   *
   * @returns {Array<{attribute: string, value: string, content: string}>}
   */
  _publishDateTags: function () {
    return [
      {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'},
      {'attribute': 'property', 'value': 'article:published_time', 'content': 'content'},
      {'attribute': 'name', 'value': 'OriginalPublicationDate', 'content': 'content'},
      {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'datetime'},
      // Microdata on a <meta> element carries its value in `content`,
      // not `datetime`, so look there as well.
      {'attribute': 'itemprop', 'value': 'datePublished', 'content': 'content'},
      {'attribute': 'property', 'value': 'og:published_time', 'content': 'content'},
      {'attribute': 'name', 'value': 'article_date_original', 'content': 'content'},
      {'attribute': 'name', 'value': 'publication_date', 'content': 'content'},
      {'attribute': 'name', 'value': 'sailthru.date', 'content': 'content'},
      {'attribute': 'name', 'value': 'PublishDate', 'content': 'content'},
      {'attribute': 'property', 'value': 'publish_time', 'content': 'content'},
      {'attribute': 'name', 'value': 'publishdate', 'content': 'content'},
      {'attribute': 'property', 'value': 'bt:pubDate', 'content': 'content'},
      {'attribute': 'name', 'value': 'speare-timestamp', 'content': 'content'},
      {'attribute': 'name', 'value': 'parsely-pub-date', 'content': 'content'},
      {'attribute': 'itemprop', 'value': 'dateCreated', 'content': 'content'}
    ];
  },

  /**
   * Regex source fragments shared by the date-string and URL patterns.
   * The values are meant for `new RegExp(...)`, so a backslash that must
   * reach the regex engine is written as `\\`.
   *
   * @returns {Object<string, string>}
   */
  _regexStrings: function () {
    return {
      // 'wednesday' was missing, so full "Wednesday, ..." dates never matched.
      'weekday_string': '(?:sun|mon|tue|wed|thu|fri|sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday|thurs|tues)',
      'month_string': '(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)',
      'year': '(?:197[0-9]|198[0-9]|199[0-9]|200[0-9]|201[0-9]|202[0-9]|203[0-9])',
      'month': '(?:10|11|12|(?:0|)[1-9])',
      'day': '(?:1[0-9]|2[0-9]|30|31|[0]{0,1}[1-9])',
      // Two-digit forms are tried first. With the one-digit form first,
      // "+1000" was read as hour "1" + minutes "00", and "T103000" as 01:03.
      'hour': '(?:[01][0-9]|2[0-3]|[0-9])',
      'minute': '(?:0[0-9]|[1-5][0-9])',
      'second': '(?:0[0-9]|[1-5][0-9])',
      'offset_minute': '(?:00|30|45)',
      'offset_operator': '[+-]',
      'opt_slash': '/?',
      'opt_slashes': '(?:/|)+',
      // `\\.` keeps the dot literal; a single backslash is dropped by the
      // string literal and would leave a match-anything `.`.
      'uri_base': '(?:http://|https://|)(?:[a-z0-9A-Z-]+\\.)+[A-Z0-9a-z-]+/'
    };
  },

  /**
   * Map a month name or abbreviation to its two-digit number.
   *
   * The match is case-insensitive and substring-based: the first of
   * jan..dec found anywhere in the input wins.
   *
   * @param {string} month_string e.g. 'Sep' or 'september'
   * @returns {string|undefined} '01'..'12', or undefined when no month
   *   abbreviation occurs in the input.
   */
  _monthStringToIntString: function (month_string) {
    const abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
    const lowered = String(month_string).toLowerCase();
    const index = abbreviations.findIndex(function (abbreviation) {
      return lowered.includes(abbreviation);
    });
    if (index === -1) {
      return undefined;
    }
    return ('0' + (index + 1)).slice(-2);
  },

  /**
   * Scan the document's <meta> elements for the first usable publication
   * date, honouring the priority order of `_publishDateTags()`.
   *
   * A matching tag whose value attribute is absent or empty is skipped, so
   * the result is always a string.
   *
   * @returns {string} The raw attribute value, or '' when nothing is found.
   */
  _scrapePublicationDateFromMetaTags: function () {
    const metaTags = document.getElementsByTagName('meta');
    // Built once; the original rebuilt the list on every loop test.
    const descriptors = this._publishDateTags();
    for (let i = 0; i < descriptors.length; i++) {
      const descriptor = descriptors[i];
      for (let j = 0; j < metaTags.length; j++) {
        const tag = metaTags[j];
        if (tag.getAttribute(descriptor['attribute']) !== descriptor['value']) {
          continue;
        }
        const rawDate = tag.getAttribute(descriptor['content']);
        if (rawDate) {
          return rawDate;
        }
      }
    }
    return '';
  },

  /**
   * Turn a match array produced by one of `_dateStringRegexes()` into
   * `YYYY-MM-DDTHH:MM:SS+HH:MM`.
   *
   * Capture positions are fixed across both regexes (empty `()` groups pad
   * the unused slots): 1-2 day/month (day-first layout), 3 year,
   * 4-5 month/day (year-first layout), 6-8 time, 9-11 offset.
   * Missing components take the sentinel defaults below (11 for time
   * parts, +00:00 for the offset).
   *
   * Examples of `match`:
   * ["20150921", "", "", "2015", "09", "21", undefined, ...]
   * ["2015-11-03 5:45", "", "", "2015", "11", "03", "5", "45", undefined, ...]
   * ["2013-09-19T03:00:00Z", "", "", "2013", "09", "19", "03", "00", "00", undefined, ...]
   * ["2014-06-23T09:47:25+00:00", "", "", "2014", "06", "23", "09", "47", "25", "+", "00", "00"]
   *
   * @param {Array<string|undefined>} match Regex match array (not modified).
   * @returns {string} Normalised datetime string.
   */
  _convertFormatDateStringMatchArray: function (match) {
    const keys = ['fullmatch', 'day', 'month', 'year', 'month', 'day', 'hour', 'minute', 'second', 'offset_operator', 'offset_hour', 'offset_minute'];
    const defaults = ['1911-11-11T11:11:11+00:00', undefined, undefined, '1911', '11', '11', '11', '11', '11', '+', '00', '00'];
    const singleDigitKeys = ['month', 'day', 'hour', 'offset_hour'];
    const monthNameRe = new RegExp(this._regexStrings()['month_string'], 'i');

    // Collected in a local object so the caller's match array is untouched.
    const parts = {};
    for (let i = 0; i < keys.length; i++) {
      const key = keys[i];
      // 'day' and 'month' appear twice; an empty later slot must keep the
      // value found in the earlier one.
      let value = match[i] || parts[key] || defaults[i];
      if (key === 'month' && value && monthNameRe.test(value)) {
        value = this._monthStringToIntString(value);
      }
      if (singleDigitKeys.includes(key) && value && value.length === 1) {
        value = '0' + value;
      }
      parts[key] = value;
    }

    const date = parts.year + '-' + parts.month + '-' + parts.day;
    const time = parts.hour + ':' + parts.minute + ':' + parts.second;
    const offset = parts.offset_operator + parts.offset_hour + ':' + parts.offset_minute;
    return date + 'T' + time + offset;
  },

  /**
   * Build the regexes that recognise raw date strings.
   *
   *  - year-first:  2015-09-21, 20150921, 2015/Sep/21, optionally followed
   *    by a time and an offset (or Z)
   *  - day-first with weekday:  "Mon, 21 Sep 2015 10:00:00 +0000"
   *
   * Both expose the same capture layout (see
   * `_convertFormatDateStringMatchArray`); the empty `()()` groups stand in
   * for the slots a layout does not use.
   *
   * @returns {RegExp[]} Case-insensitive, unanchored regexes in the order
   *   they should be tried.
   */
  _dateStringRegexes: function () {
    const parts = this._regexStrings();
    const optColon = '(?::|)';
    const dateSeparator = '(?:[-/]|)';

    const time = '(' + parts['hour'] + ')' + optColon + '(' + parts['minute'] + ')(?:' + optColon + '(?:(' + parts['second'] + ')|))';
    const offset = '(?:(?:(' + parts['offset_operator'] + ')(' + parts['hour'] + ')' + optColon + '(?:(' + parts['offset_minute'] + '))|Z)|)';

    const yearFirstDate = '(' + parts['year'] + ')' + dateSeparator + '(' + parts['month'] + '|' + parts['month_string'] + ')' + dateSeparator + '(' + parts['day'] + ')';
    const yearFirst = new RegExp('()()' + yearFirstDate + '(?:(?:T|\\s|)' + time + offset + '|)', 'i');

    const dayFirstDate = parts['weekday_string'] + ',\\s(' + parts['day'] + ')\\s(' + parts['month_string'] + ')\\s(' + parts['year'] + ')';
    const dayFirst = new RegExp(dayFirstDate + '()()' + '\\s' + time + '\\s' + offset, 'i');

    return [yearFirst, dayFirst];
  },

  /**
   * Normalise a raw date string to `YYYY-MM-DDTHH:MM:SS+HH:MM`.
   *
   * @param {string} rawDateString
   * @returns {string} The normalised date, or '' when the input is not a
   *   string or matches none of the known layouts.
   */
  _formatDateString: function (rawDateString) {
    if (typeof rawDateString !== 'string') {
      return '';
    }
    const candidates = this._dateStringRegexes();
    for (let i = 0; i < candidates.length; i++) {
      const match = rawDateString.match(candidates[i]);
      if (match) {
        return this._convertFormatDateStringMatchArray(match);
      }
    }
    return '';
  },

  /**
   * URL patterns that embed a publication date. Each entry is a
   * `[source, flags]` pair for `new RegExp`, and each source has exactly
   * one capture group: the date portion of the URL.
   *
   *  1. .../apps/pbcs.dll/article?AID=/<date>/.../<7-9 digit id>  and
   *     .../article/<date>/.../<7-9 digit id>
   *  2. .../<path segments>/<year>/<month>[/<day>]/...
   *  3. .../<path segments>/<slug>-<date>[-<10 digits>-<2 digits>][/<digit>]
   *
   * @returns {Array<[string, string]>}
   */
  _publishDateUrlPatterns: function () {
    const parts = this._regexStrings();
    const monthMatcher = '(?:' + parts['month_string'] + '|' + parts['month'] + ')';
    const dateMatcher = parts['year'] + parts['opt_slash'] + monthMatcher + parts['opt_slash'] + parts['day'];
    // One path segment: letters, dots and hyphens only. The original
    // "[\.\-a-z]" lost its backslashes inside the string literal and became
    // the range "[.-a-z]", which also matches "/" and digits. That let a
    // "segment" swallow an earlier date in the URL and made the repeated
    // group below backtrack heavily on long URLs.
    const pathSegment = '[a-z][a-z.-]*';
    // Optional scheme, optional literal "www.", then the rest of the host.
    const schemeAndHost = '^(?:https?://|)(?:www\\.|)[^/]*/+';

    return [
      [schemeAndHost + '(?:apps/pbcs\\.dll/article\\?AID=(?:/|%2f)|article/(?:zz/|))(' + dateMatcher + ')(?:/|%2f).*(?:/|%2f)[0-9]{7,8}[0-9]{0,1}', 'i'],
      [schemeAndHost + '(?:' + pathSegment + '/|)*(' + parts['year'] + '/' + monthMatcher + '(?:/' + parts['day'] + '|))/.*', 'i'],
      [schemeAndHost + '(?:' + pathSegment + '/|)*' + pathSegment + '-(' + dateMatcher + ')(?:-[0-9]{10}-[0-9]{2}|)(?:/[0-9]|)$', 'i']
    ];
  },

  /**
   * Convert separate year/month/day strings to an epoch timestamp.
   *
   * NOTE(review): "Y/M/D" is not an ISO format, so `Date.parse` handling is
   * implementation-defined; confirm the target engines before relying on it.
   *
   * @param {string} year
   * @param {string} month
   * @param {string} [day] Defaults to '1' when missing or empty.
   * @returns {number|string} Milliseconds since the epoch, or '' when the
   *   date does not parse or fails `_validPublicationDate`.
   */
  _convertMatchedDateStringsToDate: function (year, month, day) {
    const dayOrFirst = day || '1';
    const epoch = Date.parse(year + '/' + month + '/' + dayOrFirst);
    return this._validPublicationDate(epoch) ? epoch : '';
  },

  /**
   * Whether an epoch timestamp lies strictly after the epoch start and
   * before 2030/12/31. NaN (an unparseable date) is rejected.
   *
   * @param {number} parsed_date_epoch
   * @returns {boolean}
   */
  _validPublicationDate: function (parsed_date_epoch) {
    return (0 < parsed_date_epoch && parsed_date_epoch < Date.parse('2030/12/31'));
  },

  /**
   * Extract the date portion of a URL using `_publishDateUrlPatterns()`.
   *
   * @param {string} url
   * @returns {string} The captured date text (e.g. '2015/09/21' or
   *   '20150921'), or '' when `url` is not a string or no pattern matches.
   */
  _parseDateFromUrl: function (url) {
    if (typeof url !== 'string') {
      return '';
    }
    const patterns = this._publishDateUrlPatterns();
    for (let i = 0; i < patterns.length; i++) {
      const match = url.match(new RegExp(patterns[i][0], patterns[i][1]));
      // Every pattern has exactly one capture group: the date.
      if (match && match.length === 2) {
        return match[1];
      }
    }
    return '';
  },
};
I get that this isn't ideal and I haven't even looked at your suggestion yet @bobcolner, but just thought I'd share before I forgot!
PS: I plan on packaging this up into a js module once our js developer gets back from his vacation and can make all of this not terrible!
I think the heuristics
would be:
If this is the heuristics
approach, it does need some effort.
In extractors.py:173 it says that the publish date is being parsed using regex + heuristics, but I don't really see it doing this work. Am I missing something/is this being added?