codelucas / newspaper

newspaper3k is a news, full-text, and article metadata extraction library in Python 3. Advanced docs:
https://goo.gl/VX41yK
MIT License
14.18k stars 2.12k forks source link

Publish Date extraction using REGEX on the HTML + heuristics? #168

Open will3216 opened 9 years ago

will3216 commented 9 years ago

In extractors.py:173 it says that the publish date is being parsed using regex + heuristics, but I don't really see it doing this work. Am I missing something/is this being added?

bobcolner commented 8 years ago

+1 for pub-date & author extraction not being reliable

TwistingTwists commented 8 years ago

+1 for pub-date and author extraction not being reliable

bobcolner commented 8 years ago

https://github.com/Webhose/article-date-extractor Incorporating this new library would go a long way. I'm not a very good programmer, but if I can get it working I will try to submit a pull request.

will3216 commented 8 years ago

Sooo, I have access to a bunch of news article pages and after looking at them, I made some improvements to the selectivity and the expansiveness of the work-flow. However, I did this in js (not my native language) and don't really feel like porting it to python (also not my native language), but I did want to share with you guys what I have come up with and I don't imagine it will be too hard to create the changes:


// `var` is kept for the top-level binding so the object remains a global
// property when this snippet is loaded as a classic browser script.
var utils = {
  /**
   * Resolves the article's publication date as a formatted date-time string.
   *
   * Strategies are tried in order; the next one runs only when the previous
   * one produced nothing usable:
   *   1. Publication date from <meta> tags.
   *   2. Publication date embedded in the og:url.
   *   3. Raw regex searches in the HTML + added heuristics (NOT IMPLEMENTED).
   *
   * NOTE(review): `this.getOgUrl()` is not defined in this object literal; it
   * is presumably supplied by the surrounding code — confirm.
   *
   * @returns {string} The value produced by `dateParser._formatDateString`,
   *   or '' when no strategy yields a parseable date.
   */
  getPublicationDate: function () {
    // Truthiness (not `!= ''`) so a null/undefined scrape result is treated
    // as "nothing found" instead of being handed to the formatter.
    const metaDate = dateParser._scrapePublicationDateFromMetaTags();
    if (metaDate) {
      const formattedMetaDate = dateParser._formatDateString(metaDate);
      // A meta value that exists but cannot be parsed must not block the
      // URL strategy.
      if (formattedMetaDate) {
        return formattedMetaDate;
      }
    }

    const ogUrl = this.getOgUrl();
    const urlDate = ogUrl ? dateParser._parseDateFromUrl(ogUrl) : '';
    if (urlDate) {
      return dateParser._formatDateString(urlDate);
    }
    return '';
  },
};

// `var` is kept for the top-level binding so the object remains a global
// property when this snippet is loaded as a classic browser script.
var dateParser = {

  /**
   * Lists the <meta> tag signatures that may carry a publication date, in
   * lookup priority order.
   *
   * @returns {Array<{attribute: string, value: string, content: string}>}
   *   `attribute`/`value` identify the tag; `content` names the attribute
   *   that holds the date string.
   */
  _publishDateTags: function () {
    return [
      { attribute: 'property', value: 'rnews:datePublished', content: 'content' },
      { attribute: 'property', value: 'article:published_time', content: 'content' },
      { attribute: 'name', value: 'OriginalPublicationDate', content: 'content' },
      { attribute: 'itemprop', value: 'datePublished', content: 'datetime' },
      { attribute: 'property', value: 'og:published_time', content: 'content' },
      { attribute: 'name', value: 'article_date_original', content: 'content' },
      { attribute: 'name', value: 'publication_date', content: 'content' },
      { attribute: 'name', value: 'sailthru.date', content: 'content' },
      { attribute: 'name', value: 'PublishDate', content: 'content' },
      { attribute: 'property', value: 'publish_time', content: 'content' },
      { attribute: 'name', value: 'publishdate', content: 'content' },
      { attribute: 'property', value: 'bt:pubDate', content: 'content' },
      { attribute: 'name', value: 'speare-timestamp', content: 'content' },
      { attribute: 'name', value: 'parsely-pub-date', content: 'content' },
      { attribute: 'itemprop', value: 'dateCreated', content: 'content' },
    ];
  },

  /**
   * Returns the regex source fragments the other methods assemble into full
   * patterns. Every fragment is a plain string meant for `new RegExp(...)`,
   * so regex escapes need a doubled backslash (a lone `\.` in a JS string
   * literal collapses to `.`).
   *
   * @returns {Object<string, string>} Fragment name -> regex source.
   */
  _regexStrings: function () {
    return {
      weekday_string: '(?:sun|mon|tue|wed|thu|fri|sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday|thurs|tues)',
      month_string: '(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)',
      year: '(?:197[0-9]|198[0-9]|199[0-9]|200[0-9]|201[0-9]|202[0-9]|203[0-9])',
      month: '(?:10|11|12|(?:0|)[1-9])',
      day: '(?:1[0-9]|2[0-9]|30|31|[0]{0,1}[1-9])',
      // Two-digit forms come first: with the one-digit form first, a
      // colon-less "+1000" would parse as hour "1" + minute "00".
      hour: '(?:[01][0-9]|2[0-3]|[0-9])',
      minute: '(?:0[0-9]|[1-5][0-9])',
      second: '(?:0[0-9]|[1-5][0-9])',
      offset_minute: '(?:00|30|45)',
      offset_operator: '[+-]',
      opt_slash: '/?',
      opt_slashes: '(?:/|)+',
      uri_base: '(?:http://|https://|)(?:[a-z0-9A-Z-]+\\.)+[A-Z0-9a-z-]+/',
    };
  },

  /**
   * Maps a month name or abbreviation to its two-digit month number.
   * Matching is a case-insensitive substring search, checked from January
   * to December.
   *
   * @param {string} month_string Text containing a month name, e.g. "Sep".
   * @returns {string|undefined} '01'..'12', or undefined when no month name
   *   is found (or the input is not a string).
   */
  _monthStringToIntString: function (month_string) {
    if (typeof month_string !== 'string') {
      return undefined;
    }
    const abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
    const lowered = month_string.toLowerCase();
    const index = abbreviations.findIndex((abbr) => lowered.includes(abbr));
    return index === -1 ? undefined : String(index + 1).padStart(2, '0');
  },

  /**
   * Scans the document's <meta> elements for the first publication-date tag,
   * honouring the priority order of `_publishDateTags()`.
   *
   * Reads the global `document`.
   *
   * @returns {string} The raw date string, or '' when none is found.
   */
  _scrapePublicationDateFromMetaTags: function () {
    const metaTags = Array.from(document.getElementsByTagName('meta'));
    const signatures = this._publishDateTags();

    for (const { attribute, value, content } of signatures) {
      for (const tag of metaTags) {
        if (tag.getAttribute(attribute) !== value) {
          continue;
        }
        // Only <meta> elements are scanned, and those normally carry their
        // value in `content`; fall back to it when the configured attribute
        // (e.g. `datetime`) is absent.
        const rawDate = tag.getAttribute(content) || tag.getAttribute('content');
        if (rawDate) {
          return rawDate;
        }
        // Empty value: keep looking instead of returning null/''.
      }
    }
    return '';
  },

  /**
   * Builds a `YYYY-MM-DDTHH:MM:SS+HH:MM` string from a match produced by one
   * of the `_dateStringRegexes()` patterns.
   *
   * Capture groups are positional: both patterns are padded with empty
   * groups so that index i always corresponds to `fieldNames[i]`. 'day' and
   * 'month' appear twice because the two patterns capture them in different
   * positions; whichever position matched wins.
   *
   * Examples of `match`:
   *   ["20150921", "", "", "2015", "09", "21", undefined, ...]
   *   ["2015-11-03 5:45", "", "", "2015", "11", "03", "5", "45", undefined, ...]
   *   ["2014-06-23T09:47:25+00:00", "", "", "2014", "06", "23", "09", "47", "25", "+", "00", "00"]
   *
   * NOTE(review): fields absent from the match are filled from the
   * placeholder table below (1911 / 11 / 11:11:11 / +00:00), so a date-only
   * input yields a time of 11:11:11 — confirm downstream code expects that.
   *
   * @param {Array<string|undefined>} match A RegExp match array; not modified.
   * @returns {string} The assembled date-time string.
   */
  _convertFormatDateStringMatchArray: function (match) {
    const fieldNames = ['fullmatch', 'day', 'month', 'year', 'month', 'day', 'hour', 'minute', 'second', 'offset_operator', 'offset_hour', 'offset_minute'];
    const placeholders = ['1911-11-11T11:11:11+00:00', undefined, undefined, '1911', '11', '11', '11', '11', '11', '+', '00', '00'];
    const zeroPaddedFields = ['month', 'day', 'hour', 'offset_hour'];
    const monthNameRe = new RegExp(this._regexStrings().month_string, 'i');

    const fields = {};
    fieldNames.forEach((name, i) => {
      // Prefer this position's capture, then a value already resolved for
      // the same field at an earlier position, then the placeholder.
      let value = match[i] || fields[name] || placeholders[i];
      if (name === 'month' && value && monthNameRe.test(value)) {
        value = this._monthStringToIntString(value);
      }
      if (zeroPaddedFields.includes(name) && value && value.length === 1) {
        value = `0${value}`;
      }
      fields[name] = value;
    });

    const date = `${fields.year}-${fields.month}-${fields.day}`;
    const time = `${fields.hour}:${fields.minute}:${fields.second}`;
    const offset = `${fields.offset_operator}${fields.offset_hour}:${fields.offset_minute}`;
    return `${date}T${time}${offset}`;
  },

  /**
   * Compiles the patterns used to parse raw date strings.
   *
   *   1. Year-first numeric/ISO-like dates: "20150921", "2015-09-21T10:30:00Z".
   *   2. Weekday-first dates: "Mon, 21 Sep 2015 10:30:00 +0000".
   *
   * Both expose the same capture layout (see
   * `_convertFormatDateStringMatchArray`): day, month, year, month, day,
   * hour, minute, second, offset sign, offset hour, offset minute.
   *
   * @returns {RegExp[]} Case-insensitive patterns, in the order to try them.
   */
  _dateStringRegexes: function () {
    const parts = this._regexStrings();
    const time = `(${parts.hour}):?(${parts.minute})(?::?(${parts.second})?)`;
    const offset = `(?:(${parts.offset_operator})(${parts.hour}):?(${parts.offset_minute})|Z)?`;

    // The leading `()()` are empty captures that keep the group numbering
    // aligned with the weekday-first pattern.
    const yearFirstDate = `(${parts.year})[-/]?(${parts.month}|${parts.month_string})[-/]?(${parts.day})`;
    const yearFirst = new RegExp(`()()${yearFirstDate}(?:[T\\s]?${time}${offset})?`, 'i');

    // The `()()` after the year play the same role here. The trailing
    // whitespace + offset is optional so a string ending right after the
    // time still matches.
    const weekdayFirstDate = `${parts.weekday_string},\\s(${parts.day})\\s(${parts.month_string})\\s(${parts.year})`;
    const weekdayFirst = new RegExp(`${weekdayFirstDate}()()\\s${time}(?:\\s${offset})?`, 'i');

    return [yearFirst, weekdayFirst];
  },

  /**
   * Normalises a raw date string to `YYYY-MM-DDTHH:MM:SS+HH:MM`.
   *
   * @param {string} rawDateString Text taken from a meta tag or a URL.
   * @returns {string} The normalised string, or '' when the input is not a
   *   string or matches none of the known patterns.
   */
  _formatDateString: function (rawDateString) {
    if (typeof rawDateString !== 'string') {
      return '';
    }
    for (const pattern of this._dateStringRegexes()) {
      const match = rawDateString.match(pattern);
      if (match) {
        return this._convertFormatDateStringMatchArray(match);
      }
    }
    return '';
  },

  /**
   * Returns the URL patterns that locate a date inside an article URL. Each
   * pattern has exactly one capture group: the date portion.
   *
   *   1. ".../apps/pbcs.dll/article?AID=/<date>/..." or ".../article/<date>/...",
   *      followed later by a 7-9 digit id.
   *   2. ".../<section>/.../<year>/<month>[/<day>]/..."
   *   3. ".../<section>/.../<slug>-<date>" at the end of the URL.
   *
   * @returns {Array<[string, string]>} [regex source, flags] pairs.
   */
  _publishDateUrlPatterns: function () {
    const parts = this._regexStrings();
    const monthMatcher = `(?:${parts.month_string}|${parts.month})`;
    const dateMatcher = `${parts.year}${parts.opt_slash}${monthMatcher}${parts.opt_slash}${parts.day}`;
    // One path segment: starts with a letter and never crosses a '/'.
    // Keeping '/' out of the class makes `(?:segment/)*` unambiguous, which
    // avoids runaway backtracking on URLs that do not match.
    const pathSegment = '[a-z][a-z0-9._-]*';
    const urlPrefix = '^(?:https?://)?(?:www\\.)?[^/]*/+';

    return [
      [`${urlPrefix}(?:apps/pbcs\\.dll/article\\?AID=(?:/|%2f)|article/(?:zz/)?)(${dateMatcher})(?:/|%2f).*(?:/|%2f)[0-9]{7,9}`, 'i'],
      [`${urlPrefix}(?:${pathSegment}/)*(${parts.year}/${monthMatcher}(?:/${parts.day})?)/.*`, 'i'],
      [`${urlPrefix}(?:${pathSegment}/)*${pathSegment}-(${dateMatcher})(?:-[0-9]{10}-[0-9]{2})?(?:/[0-9])?$`, 'i'],
    ];
  },

  /**
   * Converts matched year/month/day tokens to an epoch timestamp (local
   * midnight).
   *
   * The date is built from numeric components rather than by parsing a
   * "Y/M/D" string, whose handling is implementation-defined, and calendar
   * rollovers such as February 31 are rejected.
   *
   * @param {string} year Four-digit year.
   * @param {string} month Month number or month name/abbreviation.
   * @param {string} [day] Day of month; defaults to the 1st.
   * @returns {number|string} Epoch milliseconds, or '' when the tokens do not
   *   form a real date accepted by `_validPublicationDate`.
   */
  _convertMatchedDateStringsToDate: function (year, month, day) {
    const monthText = String(month);
    const monthNumber = /^[0-9]+$/.test(monthText)
      ? Number(monthText)
      : Number(this._monthStringToIntString(monthText));
    const dayNumber = day ? Number(day) : 1;
    const yearNumber = Number(year);

    const candidate = new Date(yearNumber, monthNumber - 1, dayNumber);
    // If any component rolled over (or was NaN) these comparisons fail.
    const isRealDate = candidate.getFullYear() === yearNumber
      && candidate.getMonth() === monthNumber - 1
      && candidate.getDate() === dayNumber;

    const epoch = candidate.getTime();
    return isRealDate && this._validPublicationDate(epoch) ? epoch : '';
  },

  /**
   * Checks that an epoch timestamp lies strictly between the Unix epoch and
   * local midnight on 2030-12-31.
   *
   * @param {number} parsed_date_epoch Epoch milliseconds.
   * @returns {boolean} True when inside the accepted window.
   */
  _validPublicationDate: function (parsed_date_epoch) {
    const latestAccepted = new Date(2030, 11, 31).getTime();
    return 0 < parsed_date_epoch && parsed_date_epoch < latestAccepted;
  },

  /**
   * Extracts the date portion of an article URL using
   * `_publishDateUrlPatterns()`; the first pattern that matches wins.
   *
   * @param {string} url The article URL.
   * @returns {string} The matched date text (e.g. "2015/09/21"), or '' when
   *   the input is not a string or no pattern matches.
   */
  _parseDateFromUrl: function (url) {
    if (typeof url !== 'string') {
      return '';
    }
    for (const [source, flags] of this._publishDateUrlPatterns()) {
      const match = url.match(new RegExp(source, flags));
      if (match && match[1]) {
        return match[1];
      }
    }
    return '';
  },

};

I get that this isn't ideal and I haven't even looked at your suggestion yet @bobcolner, but just thought I'd share before I forgot!

PS: I plan on packaging this up into a js module once our js developer gets back from his vacation and can make all of this not terrible!

eromoe commented 8 years ago

I think the heuristics would be:

  1. convert the HTML to plain text, and convert all date-like strings to a common format
  2. use a trained model to process the text (someone needs to provide the training data to train the model)
  3. have the model weight every date string, then pick the one with the highest score.

If this is the heuristics approach, it does need some effort.