Leonidas-from-XIV / node-xml2js

XML to JavaScript object converter.
MIT License
4.88k stars 604 forks source link

2 problems when parse xml #243

Closed kalitas closed 8 years ago

kalitas commented 8 years ago

Here is the XML: http://www.f4l.org.il/site/shop/compareEngine.asp?detail_id=4161674&depart_id=261546&allUrl=1

when it parsed to json here is a part of what I got: { "STORE": { "$": { "URL": "http://www.f4l.org.il/site/detail/detail/detailDetail.asp?detail_id=4161674", "DATE": "6/10/2015", "TIME": "11:51:52", "NAME": "����� ����-NUTRA GOLD", "STATUS": "ONLINE", "ID": "" }, "PRODUCTS": { "PRODUCT": [ { "$": { "NUM": "1" }, "PRODUCT_URL": "\"http://www.f4l.org.il/site/detail/detail/detailDetail.asp?detail_id=4161712\"", "name": "\"����� ���� ��� ��� ��� 3 �"�\"", "MODEL": "\"\"", "description": "\"\"", "CATALOG_NUMBER": "\"NG1111003000\"", "CURRENCY": "\"ILS\"", "pricePerUnit": "\"89\"", "SHIPMENT_COST": "\"0\"", "DELIVERY_TIME": "\"\"", "MANUFACTURER": "\"\"", "WARRANTY": "\"0\"", "imageUrl": "\"http://www.f4l.org.il/image/users/261546/detail/big/4161712-8149.jpg\"", "TAX": "\"1\"" },

as you can see there are additional bracelets in each text and the encoding is bad... any solution for me?

Edit: the first problem was solved by setting the encoding of my response to utf8. However, the second one (additional bracelets) is not solved.

tflanagan commented 8 years ago

Not sure what you mean by "additional bracelets"

clystian commented 8 years ago

@kalitas you get double quotes "" because your XML contains data between "", so you get the same content back. It is not modified; this is not a problem with xml2js.

Look, this is a part of your XML:

<PRODUCT NUM="1">
    <PRODUCT_URL>
        "http://www.f4l.org.il/site/detail/detail/detailDetail.asp?detail_id=4161712"
    </PRODUCT_URL>
    <PRODUCT_NAME>"נוטרה גולד כלב גזע קטן 3 ק&quot;ג"</PRODUCT_NAME>
    <MODEL>""</MODEL>
    <DETAILS>
        <![CDATA[ "" ]]>
    </DETAILS>
    <CATALOG_NUMBER>"NG1111003000"</CATALOG_NUMBER>
    <CURRENCY>"ILS"</CURRENCY>
    <PRICE>"89"</PRICE>
    <SHIPMENT_COST>"0"</SHIPMENT_COST>
    <DELIVERY_TIME>""</DELIVERY_TIME>
    <MANUFACTURER>""</MANUFACTURER>
    <WARRANTY>"0"</WARRANTY>
    <IMAGE>
        "http://www.f4l.org.il/image/users/261546/detail/big/4161712-8149.jpg"
    </IMAGE>
    <TAX>"1"</TAX>
</PRODUCT>

Can you see the double quotes (bracelets)?

If you want to remove them, you can do this before using xml2js:

var xml = loadXML('........');

xml = xml.replace(/>\s*\"/ig,'>').replace(/\"\s*</ig,'<'); // remove any double quote `"` envolves your content 

var json = parser.parseString(xml);

try it!

tflanagan commented 8 years ago

@clystian That regex ignores <![CDATA[ ... ]]> which is to be completely ignored by the parser.

Just a heads up, fyi, etc, etc

kalitas commented 8 years ago

Hi guys, thanks for the answer I have 7 years experience in iOS and zero in server side :( Anyway, @clystian that solved the double quoted problem. 1.what about the CDATA?: it parsed as "" even if I set ccdata:true 2.Im having difficulties with the encoding (which I thought i solved already); the hebrew look like that: (look in the NAME Tag) { "STORE": { "$": { "URL": "http://www.f4l.org.il/site/detail/detail/detailDetail.asp?detail_id=4161674", "DATE": "7/10/2015", "TIME": "10:40:15", "NAME": "����� ����-NUTRA GOLD", "STATUS": "ONLINE", "ID": "" }, "PRODUCTS": { "PRODUCT": [ { "$": { "NUM": "1" }, "PRODUCT_URL": "http://www.f4l.org.il/site/detail/detail/detailDetail.asp?detail_id=4161712", "name": "����� ���� ��� ��� ��� 3 �"�", "MODEL": "", "description": "\"\"", "CATALOG_NUMBER": "NG1111003000", "CURRENCY": "ILS", "pricePerUnit": "89", "SHIPMENT_COST": "0", "DELIVERY_TIME": "", "MANUFACTURER": "", "WARRANTY": "0", "imageUrl": "http://www.f4l.org.il/image/users/261546/detail/big/4161712-8149.jpg", "TAX": "1" },

this is how i get my xml from the server:

function getXmlFromServer(url, callback) { var req = http.get(url, function(res) { res.setEncoding('utf8') var xml = ""; res.on('data', function(chunk) { xml += chunk; });

res.on('error', function(e) {
  callback(e, null);
}); 

res.on('timeout', function(e) {
  callback(e, null);
}); 

res.on('end', function() {
  nonDoublequetedXML = removeDoubleQuotes(xml);
  parseString(nonDoublequetedXML, {explicitArray:false,tagNameProcessors:[modifyTags]},           function(err, result) {
    fs.writeFile("./xmlresults.json", JSON.stringify(result, null, '\t'));
    callback(null, result);
  });
});

}); }

I tried so many things, but nothing fixes the problem. By the way, when I look at the XML in the browser (Chrome), the Hebrew displays fine...

kalitas commented 8 years ago

OK, found the encoding solution: I was using xml += chunk and then encoding, whereas what I should do is newBuffer = Buffer.concat([newBuffer, chunk]) and only then encode. Thanks anyway. If you have any idea why the CDATA is parsed as "\"\"", please let me know...

tflanagan commented 8 years ago

Because this is invalid data:

"description": """"

That is two strings missing an operator, so they must be escaped to be valid JSON. Hence the backslashes.

Leonidas-from-XIV commented 8 years ago

So, I suppose this is solved and had no actual bugs in xml2js, so I'm closing it.