Open chrisribe opened 9 years ago
Well, some progress: I added newlines after the HTML </p>
and <br> tags
so the algorithm knows where the text separations are... Not perfect, but better. Any ideas?
var needle = require('needle');
var read = require('node-readability');
var sanitizeHtml = require('sanitize-html');
var SummaryTool = require('node-summary');
// Page to summarize. The literal must not carry leading or trailing
// whitespace, otherwise it is not a well-formed URL for the HTTP client.
var url = "http://www.inc.com/gene-marks/the-one-way-to-tell-if-you-re-a-successful-entrepreneur.html?cid=sf01001";

/**
 * Reduce an HTML fragment to plain text while keeping its paragraph and
 * line-break boundaries visible as newlines.
 *
 * A blank line is inserted after every closing </p> and a single newline
 * after every <br> variant (<br>, <br/>, <br />, any letter case) before all
 * tags are stripped, so the text separations survive the sanitizing step.
 *
 * @param {string} html - HTML fragment to convert.
 * @returns {string} The fragment with every tag and attribute removed.
 */
function htmlToText(html) {
  var separated = html
    .replace(/<\/p>/gi, '$&\n\n')
    .replace(/<br\s*\/?>/gi, '$&\n');

  // An empty allow-list makes sanitize-html drop every tag and attribute.
  return sanitizeHtml(separated, {
    allowedTags: [],
    allowedAttributes: {}
  });
}

// Pipeline: download the page, extract the main article, strip the markup,
// summarize the remaining text and print a few size statistics.
needle.get(url, function (error, response) {
  if (error) {
    console.log("Request failed: " + error);
    return;
  }
  if (response.statusCode !== 200) {
    console.log("Unexpected HTTP status: " + response.statusCode);
    return;
  }

  read(response.body, function (err, article, meta) {
    if (err || !article) {
      console.log("Could not extract the article: " + err);
      return;
    }

    // Read what we need first, then release the parsed document if the
    // library exposes a close() hook.
    var title = article.title || "";
    var content = article.content;
    if (typeof article.close === "function") {
      article.close();
    }

    // Guard against a failed extraction: calling .replace on a non-string
    // would throw.
    if (typeof content !== "string" || content.length === 0) {
      console.log("No article content found at " + url);
      return;
    }

    var cleanCont = htmlToText(content);

    SummaryTool.summarize(title, cleanCont, function (err, summary) {
      if (err) {
        console.log("Something went wrong man!");
        // Without a summary the statistics below cannot be computed.
        return;
      }

      var originalLength = title.length + cleanCont.length;

      console.log(summary);
      console.log("Original Length " + originalLength);
      console.log("Summary Length " + summary.length);
      console.log("Summary Ratio: " + (100 - (100 * (summary.length / originalLength))));
    });
  });
});
@chrisribe I'm getting the same. I write a good couple of lines of content and try to summarize it, but I only get about one line back. The example shows loads of lines and summarizes fine, though. How can we get it to summarize better?
I never got it to work exactly as I wanted. I moved to another project and put this aside, so...
Sorry. Chris
Hi, I am trying to get a summary text for a given webpage, but the returned summary is short / bad and I can't figure out why...
Here are my steps: 1) Use needle to get the url html data 2) Run the data through readability so only the core of the html is extracted. 3) Sanitize the html to only return the text. 4) Run summarize...
Why am I getting only one or two lines of summary? Can I control how long the summary output should be? I want the output to be something like http://smmry.com
Here is the code:
Thanks Chris