Closed TroyDanielFZ closed 3 years ago
I don't plan on doing that within RSSOwlnix. Doing this externally is the better and more flexible way; see https://github.com/Xyrio/RSSOwlnix/wiki/FAQ#item_getrss
I don't plan on doing that within RSSOwlnix. Doing this externally is the better and more flexible way; see https://github.com/Xyrio/RSSOwlnix/wiki/FAQ#item_getrss
Thanks for replying, I'll have a try.
//=============================================================================
// FileName: RssGen.js
// Desc: Scrape RSS entries from various web pages
// Author: Troy Daniel
// Email: Troy_Daniel@163.com
// HomePage: https://www.cnblogs.com/troy-daniel
// Version: 0.0.2
// LastChange: 2021-07-05 20:23:30
// History:
// V0.0.1 2021-07-05
// Still rather ugly, but basically usable.
// V0.0.2 2021-07-05
// Reworked the logic; it is much more elegant now.
// Usage:
// This program is meant to complement RSSOwlnix by generating RSS feeds for
// websites that don't provide one.
// I don't want to maintain multiple files for self-generated RSS, so I use
// the 'categories' field to identify different websites/webpages.
// Before using this script, install Node.js from https://nodejs.org/en/
// and then run the following commands:
// npm -g install rss
// npm -g install request
// npm -g install cheerio
// npm -g install async
// Then add/change entries in the variable 'rules': the key is the URL, and the
// value is a function that accepts the response body and returns an array of
// objects, each of which can be added as an entry to the RSS feed. See the
// 'rules' below for examples.
//=============================================================================
const fs = require('fs');
var RSS = require('rss');
const request = require('request');
const cheerio = require('cheerio');
const async = require('async');
// const http = require('http');
// const { title } = require('process');
// const { url } = require('inspector');
// const { time } = require('console');
// const { callbackify } = require('util');
// const { from } = require('form-data');
// The single feed that collects the entries scraped from every configured page.
const feedOptions = { title: 'TD' };
const feed = new RSS(feedOptions);
// Pages to scrape. Each URL is both the key in `rules` and the base against
// which links found on that page are resolved.
const HTTRACK_FORUM_URL = 'http://forum.httrack.com/';
const AUTOMATICA_URL = 'https://www.journals.elsevier.com/automatica/recent-articles';
const NONLINEAR_DYNAMICS_URL = 'https://link.springer.com/search?query=&search-within=Journal&facet-journal-id=11071';

/**
 * Resolve a link scraped from a page into an absolute URL.
 *
 * @param {string|undefined} href - The raw `href` attribute; may be relative,
 *   root-relative, absolute, or missing.
 * @param {string} pageUrl - URL of the page the link was found on.
 * @returns {string|undefined} The absolute URL, or `undefined` when the
 *   element had no `href` (instead of a bogus ".../undefined" link).
 */
const resolveLink = (href, pageUrl) => (href ? new URL(href, pageUrl).href : undefined);

/**
 * Scraping rules, keyed by the URL of the page to fetch.
 *
 * Each value is a function that receives the fetched page body and returns an
 * array of plain objects, one per entry found on the page. Every entry carries
 * a `categories` array naming the site it came from, so that all sites can
 * share one generated feed.
 *
 * NOTE(review): the entries use a `published` key; the `rss` package documents
 * `date` as the item option for the publication date, so `published` is
 * presumably ignored by it. Kept as-is; confirm before renaming.
 */
const rules = {
  [HTTRACK_FORUM_URL]: (body) => {
    const $ = cheerio.load(body);
    return $('#pageContent td tr+ tr td').toArray().map((elem) => {
      const item = $(elem);
      return {
        title: $('a.s', item).text(),
        published: $('td.d', item).text(),
        url: resolveLink($('a.s', item).attr('href'), HTTRACK_FORUM_URL),
        categories: ['httrack forum'],
      };
    });
  },

  [AUTOMATICA_URL]: (body) => {
    const $ = cheerio.load(body);
    return $('.e19zbck30').toArray().map((elem) => {
      const item = $(elem);
      return {
        title: $('h2', item).text(),
        // NOTE(review): same 'td.d' selector as the forum rule; confirm it
        // matches anything on this page.
        published: $('td.d', item).text(),
        url: resolveLink($('article>a', item).attr('href'), AUTOMATICA_URL),
        author: $('.jhp-css-48d2gn', item).text(),
        categories: ['automatica'],
      };
    });
  },

  [NONLINEAR_DYNAMICS_URL]: (body) => {
    const $ = cheerio.load(body);
    return $('#results-list li').toArray().map((elem) => {
      const item = $(elem);
      return {
        title: $('h2', item).text(),
        published: $('span.year', item).attr('title'),
        url: resolveLink($('a.title', item).attr('href'), NONLINEAR_DYNAMICS_URL),
        // NOTE(review): same class selector as the Automatica rule; confirm it
        // matches the author element on Springer pages.
        author: $('.jhp-css-48d2gn', item).text(),
        // Scoped to the current result; an unscoped lookup would return the
        // snippets of every result on the page concatenated together.
        description: $('.snippet', item).text(),
        categories: ['Nonlinear Dynamics'],
      };
    });
  },
};
// One task per rule: fetch the page, run the rule's parser on the response
// body, and hand the resulting entries (or the failure) to the callback that
// async.parallel supplies.
const tasks = Object.keys(rules).map((url) => (asyncCallback) => {
  request(url, (error, response, body) => {
    if (error) {
      // Report through the callback rather than throwing: a throw here happens
      // inside request's own callback and never reaches the async.parallel
      // error handler.
      asyncCallback(error);
      return;
    }
    let items;
    try {
      items = rules[url](body);
    } catch (parseError) {
      asyncCallback(parseError);
      return;
    }
    // Called outside the try block so that an exception raised further down
    // the callback chain is not mistaken for a parse error and reported twice.
    asyncCallback(null, items);
  });
});
// Run every scraping task, merge the per-site entry lists into the shared
// feed, and print the resulting RSS XML to stdout.
async.parallel(tasks, (err, results) => {
  if (err) {
    console.error('Something wrong in async parallel.');
    throw err;
  }
  // `results` holds one array of entries per task; flatten it one level.
  // Unlike reduce() without an initial value, this is also safe when there are
  // no tasks at all (empty `results`).
  const items = [].concat(...results);
  items.forEach((item) => feed.item(item));
  console.log(feed.xml({ indent: true }));
});
I wrote a Node.js script to grab RSS entries; I'm posting it here in case anybody else needs it.
RSS is very useful. However, fewer websites provide RSS feeds themselves nowadays. A good idea is to retrieve an index page, and grab items with
CSS selectors
or XPath selectors
and so on. I'm desperate for this feature.