AndyTheFactory / newspaper4k

📰 Newspaper4k a fork of the beloved Newspaper3k. Extraction of articles, titles, and metadata from news websites.
MIT License
430 stars 38 forks source link

Publish Date extraction using REGEX on the HTML + heuristics? #18

Open AndyTheFactory opened 10 months ago

AndyTheFactory commented 10 months ago

Issue by will3216 Wed Nov 4 18:48:57 2015 Originally opened as https://github.com/codelucas/newspaper/issues/168


In extractors.py:173 it says that the publish date is being parsed using regex + heuristics, but I don't really see it doing this work. Am I missing something/is this being added?

AndyTheFactory commented 10 months ago

Comment by bobcolner Sun Nov 29 06:44:04 2015


+1 for pub-date & author extraction not being reliable

AndyTheFactory commented 10 months ago

Comment by TwistingTwists Wed Dec 23 05:36:19 2015


+1 for pub-date and author extraction not being reliable

AndyTheFactory commented 10 months ago

Comment by bobcolner Sat Dec 26 08:54:57 2015


https://github.com/Webhose/article-date-extractor - incorporating this new library would go a long way. I'm not a very good programmer, but if I can get it working I will try to submit a pull request.

AndyTheFactory commented 10 months ago

Comment by will3216 Wed Dec 30 13:00:14 2015


Sooo, I have access to a bunch of news article pages and after looking at them, I made some improvements to the selectivity and the expansiveness of the work-flow. However, I did this in js (not my native language) and don't really feel like porting it to python (also not my native language), but I did want to share with you guys what I have come up with and I don't imagine it will be too hard to create the changes:


var utils = {
  /**
   * Returns the article's publication date as a normalised
   * `YYYY-MM-DDTHH:MM:SS+HH:MM` string, or '' when none can be determined.
   *
   * Strategies are tried in this order, and the next one is only attempted
   * when the previous one produced nothing usable:
   *   1. Pubdate from <meta> tags
   *   2. Pubdate from the page URL (as reported by this.getOgUrl())
   *   3. Raw regex searches in the HTML + added heuristics (NOT IMPLEMENTED)
   *
   * @returns {string} normalised date string, or '' if no date was found
   */
  getPublicationDate: function(){
    const metaDate = dateParser._scrapePublicationDateFromMetaTags();
    // Only hand real, non-empty strings to the formatter; a null attribute
    // value would otherwise blow up inside it.
    if (typeof metaDate === 'string' && metaDate !== ''){
      const formattedMetaDate = dateParser._formatDateString(metaDate);
      // A meta value that cannot be normalised counts as a failed strategy,
      // so fall through to the URL instead of giving up.
      if (formattedMetaDate !== ''){
        return formattedMetaDate;
      }
    }

    // getOgUrl is expected to be supplied elsewhere on this object; without
    // it (or without a usable URL) there is nothing left to try.
    const ogUrl = (typeof this.getOgUrl === 'function') ? this.getOgUrl() : '';
    if (typeof ogUrl !== 'string' || ogUrl === ''){
      return '';
    }

    const urlDate = dateParser._parseDateFromUrl(ogUrl);
    if (typeof urlDate === 'string' && urlDate !== ''){
      return dateParser._formatDateString(urlDate);
    }
    return '';
  },
};

// Helpers for locating a publication date (in <meta> tags or in a URL) and
// normalising it to a `YYYY-MM-DDTHH:MM:SS+HH:MM` string.
var dateParser = {

  /**
   * Meta-tag descriptors, in priority order. For each entry, a <meta> element
   * whose `attribute` equals `value` is a candidate and the date is read from
   * the attribute named by `content`.
   *
   * @returns {Array<{attribute: string, value: string, content: string}>}
   */
  _publishDateTags: function(){
    return [
      {'attribute': 'property',     'value': 'rnews:datePublished', 'content': 'content'},
      {'attribute': 'property',     'value': 'article:published_time', 'content': 'content'},
      {'attribute': 'name',         'value': 'OriginalPublicationDate', 'content': 'content'},
      {'attribute': 'itemprop',     'value': 'datePublished',           'content': 'datetime'},
      {'attribute': 'property',     'value': 'og:published_time',       'content': 'content'},
      {'attribute': 'name',         'value': 'article_date_original',   'content': 'content'},
      {'attribute': 'name',         'value': 'publication_date',        'content': 'content'},
      {'attribute': 'name',         'value': 'sailthru.date',           'content': 'content'},
      {'attribute': 'name',         'value': 'PublishDate',             'content': 'content'},
      {'attribute': 'property',     'value': 'publish_time',            'content': 'content'},
      {'attribute': 'name',         'value': 'publishdate',             'content': 'content'},
      {'attribute': 'property',     'value': 'bt:pubDate',              'content': 'content'},
      {'attribute': 'name',         'value': 'speare-timestamp',        'content':'content'},
      {'attribute': 'name',         'value': 'parsely-pub-date',        'content':'content'},
      {'attribute': 'itemprop',     'value': 'dateCreated',             'content':'content'}
    ];
  },

  /**
   * Regex source fragments (strings, not RegExp objects) that the other
   * methods concatenate into full patterns. All consumers compile them with
   * the `i` flag.
   *
   * `opt_slashes` and `uri_base` are not referenced by any method of this
   * object; they are kept for callers that may read them.
   *
   * @returns {Object<string, string>}
   */
  _regexStrings: function(){
    return {
      // 'wednesday' must be listed explicitly: 'wed' alone cannot match a
      // full "Wednesday," because the pattern requires a comma right after.
      'weekday_string': '(?:sun|mon|tue|wed|thu|fri|sat|sunday|monday|tuesday|wednesday|thursday|friday|saturday|thurs|tues)',
      'month_string': '(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec|january|february|march|april|may|june|july|august|september|october|november|december)',
      'year': '(?:197[0-9]|198[0-9]|199[0-9]|200[0-9]|201[0-9]|202[0-9]|203[0-9])',
      'month': '(?:10|11|12|(?:0|)[1-9])',
      'day': '(?:1[0-9]|2[0-9]|30|31|[0]{0,1}[1-9])',
      // Two-digit forms are tried first so that colon-less input such as
      // "1030" or "+1000" is split as 10|30 / 10|00 rather than 1|03 / 1|00.
      'hour': '(?:[01][0-9]|2[0-3]|[0-9])',
      'minute': '(?:0[0-9]|[1-5][0-9])',
      'second': '(?:0[0-9]|[1-5][0-9])',
      'offset_minute': '(?:00|30|45)',
      'offset_operator': '[+-]',
      'opt_slash': "\/?",
      'opt_slashes': "(?:\/|)+",
      'uri_base': '(?:http\:\/\/|https\:\/\/|)(?:[a-z0-9A-Z\-]+\.)+[A-Z0-9a-z\-]+\/'
    };
  },

  /**
   * Maps a month name or abbreviation to its two-digit number.
   * The first month (in calendar order) whose three-letter abbreviation
   * occurs anywhere in the input, case-insensitively, wins.
   *
   * @param {string} month_string e.g. 'Sep', 'september'
   * @returns {string|undefined} '01'..'12', or undefined when nothing matches
   */
  _monthStringToIntString: function(month_string){
    const abbreviations = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                           'jul', 'aug', 'sep', 'oct', 'nov', 'dec'];
    const lowered = month_string.toLowerCase();
    for(let i = 0; i < abbreviations.length; i++){
      if(lowered.includes(abbreviations[i])){
        return (i < 9 ? '0' : '') + (i + 1);
      }
    }
    return undefined;
  },

  /**
   * Scans the document's <meta> elements for a publication date, honouring
   * the priority order of _publishDateTags().
   *
   * A matching tag whose date attribute is missing or empty is skipped
   * instead of ending the search, so callers never receive null. When the
   * configured attribute is absent the tag's `content` attribute is tried.
   *
   * @returns {string} the raw attribute value, or "" when nothing was found
   */
  _scrapePublicationDateFromMetaTags: function(){
    const metaTags = document.getElementsByTagName("meta");
    const candidates = this._publishDateTags();

    for(let i = 0; i < candidates.length; i++){
      const attr = candidates[i]['attribute'];
      const val = candidates[i]['value'];
      const content = candidates[i]['content'];

      for(let j = 0; j < metaTags.length; j++){
        if(metaTags[j].getAttribute(attr) !== val){
          continue;
        }
        const found = metaTags[j].getAttribute(content) || metaTags[j].getAttribute('content');
        if(found){
          return found;
        }
      }
    }
    return "";
  },

  /**
   * Builds the normalised date string from a match produced by one of the
   * _dateStringRegexes() patterns. The input array is not modified.
   *
   * Capture layout (both patterns are padded with empty groups to share it):
   *   [0] full match, [1] day*, [2] month*, [3] year, [4] month, [5] day,
   *   [6] hour, [7] minute, [8] second,
   *   [9] offset operator, [10] offset hour, [11] offset minute
   *   (* = filled only by the "weekday, day month year" pattern)
   *
   * Examples:
   *   ["20150921", "", "", "2015", "09", "21"]                 -> 2015-09-21T11:11:11+00:00
   *   ["2015-11-03 5:45", "", "", "2015", "11", "03", "5", "45"] -> 2015-11-03T05:45:11+00:00
   *
   * Any field that was not captured falls back to a placeholder:
   * 1911-11-11T11:11:11+00:00.
   *
   * @param {Array<string|undefined>} match regex match array
   * @returns {string} `YYYY-MM-DDTHH:MM:SS+HH:MM`
   */
  _convertFormatDateStringMatchArray: function(match){
    const monthNameRe = new RegExp(this._regexStrings()['month_string'], 'i');
    // Single-digit captures ("5") are widened to two digits ("05").
    const pad2 = function(value){
      return value.length === 1 ? '0' + value : value;
    };

    const year = match[3] || '1911';

    let month = match[4] || match[2] || '11';
    if(monthNameRe.test(month)){
      month = this._monthStringToIntString(month);
    }
    month = pad2(month);

    const day = pad2(match[5] || match[1] || '11');
    const hour = pad2(match[6] || '11');
    const minute = match[7] || '11';
    const second = match[8] || '11';
    const offset_operator = match[9] || '+';
    const offset_hour = pad2(match[10] || '00');
    const offset_minute = match[11] || '00';

    const date = year + '-' + month + '-' + day;
    const time = hour + ':' + minute + ':' + second;
    const offset = offset_operator + offset_hour + ':' + offset_minute;
    return date + 'T' + time + offset;
  },

  /**
   * Compiles the patterns used to recognise a raw date string.
   *   1. year, month (number or name), day, optionally followed by a time
   *      and a UTC offset or 'Z'   (e.g. 2014-06-23T09:47:25+00:00, 20150921)
   *   2. weekday, day, month name, year, time and optional offset
   *      (e.g. "Mon, 21 Sep 2015 10:30:00 +0000")
   * Neither pattern is anchored, so a date embedded in longer text matches.
   *
   * @returns {RegExp[]} patterns in the order they should be tried
   */
  _dateStringRegexes: function(){
    var opt_colon = '(?:\:|)';
    var parts = this._regexStrings();
    var time_re_str   = '('+parts['hour']+')'+opt_colon+'('+parts['minute']+')(?:'+opt_colon+'(?:('+parts['second']+')|))';
    var offset_re_str = '(?:(?:('+parts['offset_operator']+')('+parts['hour']+')'+opt_colon+'(?:('+parts['offset_minute']+'))|Z)|)';

    var date_re_str_1   = '('+parts['year']+')(?:[\-\/]|)('+parts['month']+'|'+parts['month_string']+')(?:[\-\/]|)('+parts['day']+')';
    var re_str_1 = '()()'+date_re_str_1+'(?:(?:T|\\s|)'+time_re_str+offset_re_str+'|)';
    //            empty captures to standardize resulting match array
    var re_1 = new RegExp(re_str_1, 'i');

    var date_re_str_2 = parts['weekday_string']+'\,\\s('+parts['day']+')\\s('+parts['month_string']+')\\s('+parts['year']+')';
    var re_str_2 = date_re_str_2+'()()'+'\\s'+time_re_str+'\\s'+offset_re_str;
    //                     empty captures to standardize resulting match array
    var re_2 = new RegExp(re_str_2, 'i');

    return [re_1, re_2];
  },

  /**
   * Normalises a raw date string using the first pattern that matches.
   *
   * @param {string} rawDateString text that may contain a date
   * @returns {string} normalised date, or '' when the input is not a string
   *   or no pattern matches
   */
  _formatDateString: function(rawDateString){
    if(typeof rawDateString !== 'string'){
      return '';
    }
    const datetimeRegExps = this._dateStringRegexes();
    for(let i = 0; i < datetimeRegExps.length; i++){
      const match = rawDateString.match(datetimeRegExps[i]);
      if(match){
        return this._convertFormatDateStringMatchArray(match);
      }
    }
    return '';
  },

  /**
   * Returns the URL patterns as [source, flags] pairs; each source has
   * exactly one capturing group, which holds the date portion of the URL.
   *
   * NOTE(review): the sources are ordinary string literals, so `\.` and `\-`
   * reach the RegExp constructor as plain `.` and `-`. Inside `path` this
   * makes `.-a` a character range (which also admits digits and '/') rather
   * than three literal characters. Left as is because tightening it would
   * change which URLs match - confirm the intent before altering.
   *
   * @returns {Array<[string, string]>}
   */
  _publishDateUrlPatterns: function(){
    var regexes = this._regexStrings();
    var month_matcher = "(?:"+regexes['month_string']+"|"+regexes['month']+")";
    var path = "[a-z][\.\-a-z]*";
    var date_matcher = regexes['year']+regexes['opt_slash']+month_matcher+regexes['opt_slash']+regexes['day'];
    var date_from_url_regexes = [
      ["^(?:https?\:\/\/|https\:\/\/|)(?:www\.|)[^\/]*(?:\/|\/+)(?:apps\/pbcs\.dll\/article\\?AID\=(?:\/|\%2f)|article\/(?:zz\/|))("+date_matcher+")(?:\/|\%2f)\.*(?:\/|\%2f)[0-9]{7,8}[0-9]{0,1}", "i"],
      ["^(?:http\:\/\/|https\:\/\/|)(?:www\.|)[^\/]*(?:\/|\/+)(?:"+path+"\/|)*("+regexes['year']+"\/"+month_matcher+"(?:\/"+regexes['day']+"|))\/.*", "i"],
      ["^(?:http\:\/\/|https\:\/\/|)(?:www\.|)[^\/]*(?:\/|\/+)(?:"+path+"\/|)*"+path+"\-("+date_matcher+")(?:\-[0-9]{10}\-[0-9]{2}|)(?:\/[0-9]|)$", "i"]
    ];
    return date_from_url_regexes;
  },

  /**
   * Turns year/month/day strings into an epoch-millisecond timestamp.
   * A missing day defaults to the first of the month.
   *
   * The "Y/M/D" form handed to Date.parse is not ISO 8601, so how it is
   * interpreted is up to the JavaScript engine.
   *
   * @param {string} year
   * @param {string} month
   * @param {string} [day]
   * @returns {number|string} timestamp, or '' when it fails
   *   _validPublicationDate
   */
  _convertMatchedDateStringsToDate: function(year, month, day){
    const dayOrFirst = day ? day : '1';
    const date = Date.parse(year+"/"+month+"/"+dayOrFirst);
    return this._validPublicationDate(date) ? date : '';
  },

  /**
   * Accepts timestamps strictly after the epoch and strictly before
   * 2030/12/31. NaN (an unparseable date) is rejected by the comparisons.
   *
   * @param {number} parsed_date_epoch epoch milliseconds
   * @returns {boolean}
   */
  _validPublicationDate: function(parsed_date_epoch){
    return (0 < parsed_date_epoch && parsed_date_epoch < Date.parse("2030/12/31"));
  },

  /**
   * Extracts the date portion of a URL using the first pattern from
   * _publishDateUrlPatterns() that matches.
   *
   * @param {string} url
   * @returns {string} the captured date text exactly as it appears in the
   *   URL (e.g. "2015/09/21"), or '' when the input is not a string or no
   *   pattern matches
   */
  _parseDateFromUrl: function(url){
    if(typeof url !== 'string'){
      return '';
    }
    const patterns = this._publishDateUrlPatterns();
    for(let i = 0; i < patterns.length; i++){
      const re = new RegExp(patterns[i][0], patterns[i][1]);
      const match = url.match(re);
      // length 2 == full match plus the single date capture
      if(match && match.length === 2){
        return match[1];
      }
    }
    return '';
  },

};

I get that this isn't ideal and I haven't even looked at your suggestion yet @bobcolner, but just thought I'd share before I forgot!

PS: I plan on packaging this up into a js module once our js developer gets back from his vacation and can make all of this not terrible!

AndyTheFactory commented 10 months ago

Comment by eromoe Fri Jan 15 08:08:54 2016


I think the heuristics would be:

  1. Convert the HTML to plain text, and convert all date-like strings to a common format.
  2. Use a trained model to process the text (someone needs to provide the training data to train the model).
  3. Have the model score every date string, then take the one with the highest score.

If this is the heuristics approach, it does need some effort.