arlyxiao / best-practice

1 stars 0 forks source link

Parse HTML text #18

Open arlyxiao opened 3 years ago

arlyxiao commented 3 years ago
# Helpers for pulling the text-dense part out of an HTML page.
#
# The page is reduced to plain-text lines, the amount of text in every window
# of consecutive lines is measured (the "block distribution"), and runs of
# windows that rise above a threshold are collected as content.
module ContentExtractor
  # Minimum number of characters a block must exceed to start a content run.
  DEFAULT_THRESHOLD = 86
  # Number of consecutive lines summed into one block.
  DEFAULT_BLOCK_WIDTH = 3

  HTML_COMMENT_PATTERN = /<!--.*?-->/m.freeze
  SCRIPT_PATTERN = %r{<script\b[^>]*>.*?</script\s*>}mi.freeze
  # Deliberately not multi-line: only anchors that open and close on one line
  # are dropped, so an unclosed <a> cannot swallow the rest of the document.
  ANCHOR_PATTERN = %r{<a\s.*?>.*?</a>}i.freeze
  STYLE_PATTERN = %r{<style\b[^>]*>.*?</style\s*>}mi.freeze
  TAG_PATTERN = /<[^>]*>/.freeze

  # NOTE(review): '&copy;' is mapped to '@' rather than the copyright sign;
  # kept as-is because callers may depend on it -- confirm intent.
  ENTITY_MAP = {
    '&#8226;' => '·',
    '&amp;' => '&',
    '&nbsp;' => ' ',
    '&copy;' => '@'
  }.freeze
  ENTITY_PATTERN = Regexp.union(ENTITY_MAP.keys).freeze
  LINE_BREAK_PATTERN = /\r\n?/.freeze

  # Strips markup from an HTML fragment and decodes a few entities.
  #
  # Comments, <script> and <style> elements are each replaced by a newline;
  # single-line <a ...>...</a> elements are removed together with their text;
  # every remaining tag is removed. The argument is never modified.
  #
  # @param dom [#to_s] HTML source (a String, or anything whose #to_s is HTML)
  # @return [String] a new string containing the remaining text
  def get_clean_text(dom)
    text = dom.to_s
              .gsub(HTML_COMMENT_PATTERN, "\n")
              .gsub(SCRIPT_PATTERN, "\n")
              .gsub(ANCHOR_PATTERN, '')
              .gsub(STYLE_PATTERN, "\n")
              .gsub(TAG_PATTERN, '')
    # `gsub` always returns a fresh String, so mutating it here is safe.
    replace_special_char(text)
  end

  # Decodes the entities listed in ENTITY_MAP and normalises "\r\n" / "\r"
  # line endings to "\n". Entities are decoded in a single pass so that text
  # such as "&amp;nbsp;" becomes "&nbsp;" and is not decoded a second time.
  #
  # @param str [String] string to clean; it is modified in place
  # @return [String] the same object as +str+
  def replace_special_char(str)
    str.gsub!(ENTITY_PATTERN, ENTITY_MAP)
    str.gsub!(LINE_BREAK_PATTERN, "\n")
    str
  end

  # Sums the lengths of every window of +block_width+ consecutive lines.
  #
  # @param lines [Array<String>]
  # @param block_width [Integer] window size, must be positive
  # @return [Array<Integer>] entry i is the total length of
  #   lines[i, block_width]; empty when there are fewer lines than the width
  def line_block_distribute(lines, block_width = DEFAULT_BLOCK_WIDTH)
    lines.map(&:length).each_cons(block_width).map(&:sum)
  end

  # Finds where a run of dense text starts.
  #
  # @param block_distribution [Array<Integer>]
  # @param start [Integer] the search begins at start + 1
  # @param threshold [Integer] a block must be strictly greater than this
  # @return [Integer] first index whose block exceeds +threshold+ and that has
  #   at least one non-empty block among the following three, or -1
  def find_surge(block_distribution, start, threshold = DEFAULT_THRESHOLD)
    lookahead = 3
    # Stop early enough that index + lookahead is always a valid index.
    limit = block_distribution.length - lookahead
    surge = ((start + 1)...limit).find do |index|
      block_distribution[index] > threshold &&
        block_distribution[(index + 1)..(index + lookahead)].any?(&:positive?)
    end
    surge || -1
  end

  # Finds where the run of text that began at +surge_point+ ends.
  #
  # @param block_distribution [Array<Integer>]
  # @param surge_point [Integer] the search begins at surge_point + 1
  # @return [Integer] the index just before the first pair of consecutive
  #   empty blocks, or the last index when no such pair is found
  def find_dive(block_distribution, surge_point)
    limit = block_distribution.size - 2
    dive = ((surge_point + 1)...limit).find do |index|
      block_distribution[index].zero? && block_distribution[index + 1].zero?
    end
    dive ? dive - 1 : block_distribution.size - 1
  end

  # Locates the next content run after +to_line+.
  #
  # @param block_distribution [Array<Integer>]
  # @param to_line [Integer] index where the previous run ended
  # @param threshold [Integer] passed through to #find_surge
  # @return [Array(Integer, Integer)] [from_line, to_line]; from_line is -1
  #   when no further run exists, in which case to_line is not meaningful
  def get_content_block(block_distribution, to_line, threshold = DEFAULT_THRESHOLD)
    from_line = find_surge(block_distribution, to_line, threshold)
    [from_line, find_dive(block_distribution, from_line)]
  end

  # Collects every content run and joins the selected lines with "\n".
  #
  # @param lines [Array<String>]
  # @param block_distribution [Array<Integer>] as produced by
  #   #line_block_distribute for +lines+
  # @return [String] the joined content, "" when nothing qualifies
  def get_content(lines, block_distribution)
    # NOTE(review): #find_surge starts at to_line + 1, so block 0 is never a
    # candidate for the first run.
    to_line = 0
    content = []
    loop do
      from_line, to_line = get_content_block(block_distribution, to_line)
      # Check before slicing: a negative from_line would index from the end of
      # +lines+ (or yield nil) and pollute the result.
      break if from_line.negative?

      content.concat(lines[from_line..to_line] || [])
    end

    content.join("\n")
  end

  # Picks the element that is the parent of the most <p> elements whose text
  # occurs in +block_content+, and returns that element's cleaned text with
  # each line stripped and runs of newlines collapsed to one.
  #
  # @param html [String] full HTML document
  # @param block_content [String] text previously extracted, e.g. by #get_content
  # @return [String] "" when no paragraph matches
  def get_content_by_tag(html, block_content)
    paragraphs = Nokogiri::HTML(html).css('p')
    matching = paragraphs.select do |paragraph|
      text = paragraph.text
      # An empty string is "included" in every string, so blank paragraphs
      # would otherwise always vote for their parent.
      !text.strip.empty? && block_content.include?(text)
    end
    parents = matching.map(&:parent)
    best_parent = parents.max_by { |parent| parents.count(parent) }

    get_clean_text(best_parent.to_s)
      .split("\n")
      .map(&:strip)
      .join("\n")
      .squeeze("\n") # only newlines; a bare squeeze would mangle words
  end
end