Open arlyxiao opened 3 years ago
# Heuristic main-content extraction for HTML pages.
#
# The module is a mixin: include/extend it to get the helpers below. The
# approach is a line-block text-density scan: markup is stripped, the remaining
# text is split into lines, consecutive lines are summed into overlapping
# blocks, and runs of dense blocks ("surge" .. "dive") are taken as content.
module ContentExtractor
  # Minimum character count a block must exceed to start a content run.
  DEFAULT_SURGE_THRESHOLD = 86

  # Number of consecutive lines summed into one block.
  DEFAULT_BLOCK_WIDTH = 3

  # Ordered replacements applied by #replace_special_char. Each entry accepts
  # both the HTML entity and the already-decoded character. `&amp;` is decoded
  # last so that text such as "&amp;nbsp;" is not decoded twice.
  SPECIAL_CHAR_REPLACEMENTS = [
    [/&bull;|•/, '·'],
    [/&nbsp;|\u00A0/, ' '],
    [/&copy;|©/, '@'],
    [/\r\n|\r/, "\n"],
    ['&amp;', '&']
  ].freeze

  # Strips comments, scripts, styles, single-line anchors and all remaining
  # tags from an HTML string and normalises a few special characters.
  #
  # @param dom [String] raw HTML markup (not modified; may be frozen)
  # @return [String] the remaining text
  def get_clean_text(dom)
    # dup (not clone) so a frozen input still yields a mutable working copy.
    html = dom.dup

    # HTML comments, possibly spanning several lines.
    html.gsub!(/<!--.*?-->/m, "\n")
    # Inline and external scripts, including their bodies.
    html.gsub!(%r{<script.*?>.*?</script>}m, "\n")
    # Anchors whose content sits on a single line (link text is dropped as
    # navigation noise). Deliberately not multiline.
    html.gsub!(%r{<a\s.*?>.*?</a>}, '')
    # Style sheets, including their bodies.
    html.gsub!(%r{<style.*?>.*?</style>}m, "\n")
    # Every remaining tag.
    html.gsub!(/<.*?>/m, '')

    replace_special_char(html)
  end

  # Replaces a handful of entities/special characters and normalises line
  # endings to "\n".
  #
  # @param str [String] text to normalise; it is modified in place
  # @return [String] the same (mutated) string
  def replace_special_char(str)
    SPECIAL_CHAR_REPLACEMENTS.each do |pattern, replacement|
      str.gsub!(pattern, replacement)
    end
    str
  end

  # Builds the block distribution: entry i is the total length of lines
  # i .. i + block_width - 1.
  #
  # @param lines [Array<String>]
  # @param block_width [Integer] lines per block (must be positive)
  # @return [Array<Integer>] lines.length - block_width + 1 entries, or []
  #   when there are fewer lines than block_width
  def line_block_distribute(lines, block_width = DEFAULT_BLOCK_WIDTH)
    lines.map(&:length).each_cons(block_width).map(&:sum)
  end

  # Finds the first block after +start+ whose size exceeds +threshold+ and
  # that is followed by at least one non-empty block among the next three.
  #
  # @param block_distribution [Array<Integer>]
  # @param start [Integer] index after which the search begins
  # @param threshold [Integer]
  # @return [Integer] index of the surge block, or -1 when none is found
  def find_surge(block_distribution, start, threshold = DEFAULT_SURGE_THRESHOLD)
    # Stop three short of the end so the look-ahead stays in bounds.
    search_end = block_distribution.length - 3
    surge = ((start + 1)...search_end).find do |index|
      block_distribution[index] > threshold &&
        block_distribution[index + 1, 3].any? { |count| count > 0 }
    end
    surge || -1
  end

  # Finds where the content run that started at +surge_point+ ends: the index
  # just before the first pair of consecutive empty blocks.
  #
  # @param block_distribution [Array<Integer>]
  # @param surge_point [Integer] index returned by #find_surge
  # @return [Integer] last index of the run, or the last block index when no
  #   pair of empty blocks follows
  def find_dive(block_distribution, surge_point)
    search_end = block_distribution.size - 2
    dive = ((surge_point + 1)...search_end).find do |index|
      block_distribution[index].zero? && block_distribution[index + 1].zero?
    end
    dive ? dive - 1 : block_distribution.size - 1
  end

  # Locates the next content run after +to_line+.
  #
  # @param block_distribution [Array<Integer>]
  # @param to_line [Integer] end index of the previous run (0 to start)
  # @param threshold [Integer] surge threshold passed to #find_surge
  # @return [Array(Integer, Integer)] [from_line, to_line]; from_line is -1
  #   when no further run exists
  def get_content_block(block_distribution, to_line, threshold = DEFAULT_SURGE_THRESHOLD)
    from_line = find_surge(block_distribution, to_line, threshold)
    [from_line, find_dive(block_distribution, from_line)]
  end

  # Collects every content run and joins the corresponding lines.
  #
  # @param lines [Array<String>] cleaned text split into lines
  # @param block_distribution [Array<Integer>] result of
  #   #line_block_distribute for +lines+
  # @param threshold [Integer] surge threshold
  # @return [String] the content lines joined by "\n" ("" when none found)
  def get_content(lines, block_distribution, threshold = DEFAULT_SURGE_THRESHOLD)
    content = []
    to_line = 0

    loop do
      from_line, to_line = get_content_block(block_distribution, to_line, threshold)
      # Check before appending: with from_line == -1 the slice would start at
      # the LAST line (or be nil for empty input).
      break if from_line < 0

      content.concat(lines[from_line..to_line] || [])
    end

    content.join("\n")
  end

  # Picks the element that is the parent of the most <p> tags whose text
  # appears in +block_content+ and returns its cleaned text.
  #
  # Requires Nokogiri to be loaded by the including application.
  #
  # @param html [String] raw HTML of the page
  # @param block_content [String] text previously extracted by #get_content
  # @return [String] cleaned text of the winning container ("" when no
  #   paragraph matches)
  def get_content_by_tag(html, block_content)
    doc = Nokogiri::HTML(html)

    parents = doc.css('p').each_with_object([]) do |p_dom, found|
      text = p_dom.text
      # An empty string is included in every string, so blank paragraphs
      # would otherwise always vote for their parent.
      next if text.strip.empty?

      found << p_dom.parent if block_content.include?(text)
    end
    return '' if parents.empty?

    max_p = parents.max_by { |node| parents.count(node) }

    # Collapse only runs of newlines/spaces; a bare #squeeze would also
    # collapse doubled letters inside words.
    get_clean_text(max_p.to_s).split("\n").map(&:strip).join("\n").squeeze("\n ")
  end
end