Leonidas-from-XIV / node-xml2js

XML to JavaScript object converter.
MIT License
4.84k stars 596 forks source link

Error: Unexpected Close Tag #679

Closed asontha closed 1 year ago

asontha commented 1 year ago

Hey! I have a script that loads sitemaps and then uses xml2js to convert them into JSON for use. Previously it worked just fine, but now I always seem to get an "Unexpected close tag" error. Did something change? I recently tried upgrading to 0.5.0 to see if that would fix it, but I'm still running into the same issue.

Sitemap url: https://www.zurichna.com/sitemap.xml

Version ^0.4.23

 Error: Unexpected close tag
Line: 6
Column: 14
Char: >
    at error (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:667:10)
    at strictFail (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:693:7)
    at closeTag (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:887:9)
    at SAXParser.write (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:1449:13)
    at exports.Parser.Parser.parseString (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:323:31)
    at Parser.parseString (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:5:59)
    at <SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:338:24
    at new Promise (<anonymous>)
    at exports.Parser.Parser.parseStringPromise (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:336:14)
    at Parser.parseStringPromise (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:5:59)
<SCRIPT_PATH>/scraping.js:173
      site_urls = site_map_json.urlset.url.map(u => u.loc[0])
                                       ^

TypeError: Cannot read properties of undefined (reading 'url')
    at main (<SCRIPT_PATH>/scraping.js:173:40)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)

Version ^0.5.0

Error: Unexpected close tag
Line: 6
Column: 14
Char: >
    at error (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:667:10)
    at strictFail (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:693:7)
    at closeTag (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:887:9)
    at SAXParser.write (<SCRIPT_PATH>/node_modules/sax/lib/sax.js:1449:13)
    at exports.Parser.Parser.parseString (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:327:31)
    at Parser.parseString (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:5:59)
    at <SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:342:24
    at new Promise (<anonymous>)
    at exports.Parser.Parser.parseStringPromise (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:340:14)
    at Parser.parseStringPromise (<SCRIPT_PATH>/node_modules/xml2js/lib/parser.js:5:59)
<SCRIPT_PATH>/scraping.js:173
      site_urls = site_map_json.urlset.url.map(u => u.loc[0])
                                       ^

TypeError: Cannot read properties of undefined (reading 'url')
    at main (<SCRIPT_PATH>/scraping.js:173:40)
    at process.processTicksAndRejections (node:internal/process/task_queues:95:5)

Here's how I'm using xml2js:


/**
 * Download a sitemap over HTTP with axios.
 *
 * @param {string} sitemap_url - URL passed straight to `axios.get`.
 * @returns {Promise<object>} Resolves with whatever `axios.get` resolves with
 *   (the call site reads the body from its `.data` property).
 * @throws Re-throws the value `axios.get` rejected with, after logging it.
 */
async function fetchSiteMapXML(sitemap_url) {
  try {
    return await axios.get(sitemap_url);
  } catch (error) {
    console.log('fetchSiteMapXML Request Failed:', error);
    // Propagate the failure. Returning the error object here would make the
    // caller treat it as a successful response and read `.data` from it.
    throw error;
  }
}

/**
 * Parse an XML string into a JavaScript object using xml2js.
 *
 * @param {string} xml - XML text handed to `Parser#parseStringPromise`.
 * @returns {Promise<object>} Resolves with the parsed result.
 * @throws Re-throws the parser's error after logging, so a failed parse is
 *   never mistaken for a parsed document by the caller.
 */
async function convertXMLToJSON(xml) {
  const parser = new xml2js.Parser();
  try {
    const result = await parser.parseStringPromise(xml);
    console.log("JSON conversion complete");
    return result;
  } catch (error) {
    console.log("JSON conversion failed");
    // Do not return the error as if it were the result: callers would then
    // dereference fields like `.urlset` on an Error and fail far from the cause.
    throw error;
  }
}

The output of fetchSiteMapXML is fed directly into convertXMLToJSON.

asontha commented 1 year ago

Here's how these are being called in the script

// Fetch the sitemap, then convert the response body from XML to a JS object.
let site_map_response = await fetchSiteMapXML(site_map_url);

let site_map_json = await convertXMLToJSON(site_map_response.data);
console.log("Sitemap JSON: ", site_map_json);
// Fail with a descriptive message when the parsed value is not a <urlset>
// sitemap (for example an HTML block page or an error value), instead of an
// opaque "Cannot read properties of undefined" TypeError.
if (!site_map_json || !site_map_json.urlset || !Array.isArray(site_map_json.urlset.url)) {
  throw new Error('Sitemap at ' + site_map_url + ' did not parse to a <urlset> with <url> entries');
}
// Collect the first <loc> value of every <url> entry.
site_urls = site_map_json.urlset.url.map(u => u.loc[0])
asontha commented 1 year ago

Turns out the website had a special bot blocker on their sitemap.