Closed vanniktech closed 1 year ago
In the meantime I did find HTMLString for iOS, which does a superb parsing job. It's really fast, and I wired it in via expect/actual for each platform implementation.
Nonetheless, it would be really amazing if we had a fast implementation of unescapeEntities
here that would also work on the other targets like JS/Desktop.
You can use `decodeHtml`, which works the same as `unescapeEntities`:
val decoded = KsoupEntities.decodeHtml("Hello &amp; World") // return: Hello & World
Try it and let me know if this is what you are looking for
It does not work in all cases; for instance, emojis are broken:
@Test fun decodeEmoji() {
assertEquals("""🌍""", KsoupEntities.decodeHtml("&#127757;"))
}
java.lang.IllegalArgumentException at com.mohamedrejeb.ksoup.entities.utils.CharsUtils.toChars(CharsUtils.kt:29) at com.mohamedrejeb.ksoup.entities.text.translate.NumericEntityDecoder.translate(NumericEntityDecoder.kt:110) at com.mohamedrejeb.ksoup.entities.AggregateTranslator.translate(AggregateTranslator.kt:33) at com.mohamedrejeb.ksoup.entities.StringTranslator.translate(StringTranslator.kt:42) at com.mohamedrejeb.ksoup.entities.StringTranslator.translate(StringTranslator.kt:16) at com.mohamedrejeb.ksoup.entities.KsoupEntities.decodeHtml5(KsoupEntities.kt:318) at com.mohamedrejeb.ksoup.entities.KsoupEntities.decodeHtml(KsoupEntities.kt:348)
Emojis are not supported for now. I'll work on adding Emojis support.
Amazing. `RegionalIndicatorSymbolLetter` and some kind of alphabet (I don't know the name of that Unicode block) are also failing, so in total these 3 tests are failing for me right now:
@Test fun pruneSomeKindOfAlphabet() {
assertEquals(expected = """𝟸""", actual = KsoupEntities.decodeHtml("&#120824;"))
}
@Test fun pruneRegionalIndicatorSymbolLetter() {
assertEquals(expected = """🇮""", actual = KsoupEntities.decodeHtml("&#127470;"))
}
@Test fun pruneEmojis() {
assertEquals(expected = """🌍""", actual = KsoupEntities.decodeHtml("&#127757;"))
}
And then the other ones are all passing 🥳 :
@Test fun pruneBasicLatin() {
assertEquals(expected = "*", actual = KsoupEntities.decodeHtml("*"))
}
@Test fun pruneLatin1Supplement() {
assertEquals(expected = "®", actual = KsoupEntities.decodeHtml("®"))
}
@Test fun pruneLatinExtendedA() {
assertEquals(expected = "Ġ", actual = KsoupEntities.decodeHtml("Ġ"))
}
@Test fun pruneLatinExtendedB() {
assertEquals(expected = "ȭ", actual = KsoupEntities.decodeHtml("ȭ"))
}
@Test fun pruneIPAExtensions() {
assertEquals(expected = "ɠ", actual = KsoupEntities.decodeHtml("ɠ"))
}
@Test fun pruneSpacingModifierLetters() {
assertEquals(expected = "˧", actual = KsoupEntities.decodeHtml("˧"))
}
@Test fun pruneCombiningDiacriticalMarks() {
assertEquals(expected = """͠""", actual = KsoupEntities.decodeHtml("͠"))
}
@Test fun pruneGreek() {
assertEquals(expected = "Ϯ", actual = KsoupEntities.decodeHtml("Ϯ"))
}
@Test fun pruneCyrillic() {
assertEquals(expected = "Ц", actual = KsoupEntities.decodeHtml("Ц"))
}
@Test fun pruneHebrew() {
assertEquals(expected = """ױ""", actual = KsoupEntities.decodeHtml("ױ"))
}
@Test fun pruneArabic() {
assertEquals(expected = "ب", actual = KsoupEntities.decodeHtml("ب"))
}
@Test fun pruneSyriac() {
assertEquals(expected = """܈""", actual = KsoupEntities.decodeHtml("܈"))
}
@Test fun pruneThaana() {
assertEquals(expected = """ޖ""", actual = KsoupEntities.decodeHtml("ޖ"))
}
@Test fun pruneDevanagari() {
assertEquals(expected = """औ""", actual = KsoupEntities.decodeHtml("औ"))
}
@Test fun pruneBengali() {
assertEquals(expected = """৺""", actual = KsoupEntities.decodeHtml("৺"))
}
@Test fun pruneGurmukhi() {
assertEquals(expected = """ਆ""", actual = KsoupEntities.decodeHtml("ਆ"))
}
@Test fun pruneGujarati() {
assertEquals(expected = """ઈ""", actual = KsoupEntities.decodeHtml("ઈ"))
}
@Test fun pruneOriya() {
assertEquals(expected = """୯""", actual = KsoupEntities.decodeHtml("୯"))
}
@Test fun pruneTamil() {
assertEquals(expected = """௫""", actual = KsoupEntities.decodeHtml("௫"))
}
@Test fun pruneTelugu() {
assertEquals(expected = """౭""", actual = KsoupEntities.decodeHtml("౭"))
}
@Test fun pruneKannada() {
assertEquals(expected = """೯""", actual = KsoupEntities.decodeHtml("೯"))
}
@Test fun pruneMalayalam() {
assertEquals(expected = """ഗ""", actual = KsoupEntities.decodeHtml("ഗ"))
}
@Test fun pruneSinhala() {
assertEquals(expected = """ඊ""", actual = KsoupEntities.decodeHtml("ඊ"))
}
@Test fun pruneThai() {
assertEquals(expected = """ค""", actual = KsoupEntities.decodeHtml("ค"))
}
@Test fun pruneLao() {
assertEquals(expected = """ຖ""", actual = KsoupEntities.decodeHtml("ຖ"))
}
@Test fun pruneTibetan() {
assertEquals(expected = """࿏""", actual = KsoupEntities.decodeHtml("࿏"))
}
@Test fun pruneMyanmar() {
assertEquals(expected = """ည""", actual = KsoupEntities.decodeHtml("ည"))
}
@Test fun pruneGeorgian() {
assertEquals(expected = """Ⴂ""", actual = KsoupEntities.decodeHtml("Ⴂ"))
}
@Test fun pruneLatinExtendedAdditional() {
assertEquals(expected = "Ỹ", actual = KsoupEntities.decodeHtml("Ỹ"))
}
@Test fun pruneGreekExtended() {
assertEquals(expected = "ἁ", actual = KsoupEntities.decodeHtml("ἁ"))
}
@Test fun pruneGeneralPunctuation() {
assertEquals(expected = "‒", actual = KsoupEntities.decodeHtml("‒"))
}
@Test fun pruneSuperscriptsAndSubscripts() {
assertEquals(expected = "⁾", actual = KsoupEntities.decodeHtml("⁾"))
}
@Test fun pruneCurrencySymbols() {
assertEquals(expected = "₦", actual = KsoupEntities.decodeHtml("₦"))
}
@Test fun pruneCombiningMarksForSymbols() {
assertEquals(expected = """⃝""", actual = KsoupEntities.decodeHtml("⃝"))
}
@Test fun pruneLetterLikeSymbols() {
assertEquals(expected = "℈", actual = KsoupEntities.decodeHtml("℈"))
}
@Test fun pruneNumberForms() {
assertEquals(expected = "⅛", actual = KsoupEntities.decodeHtml("⅛"))
}
@Test fun pruneArrows() {
assertEquals(expected = "↘", actual = KsoupEntities.decodeHtml("↘"))
}
@Test fun pruneMathematicalOperators() {
assertEquals(expected = "∂", actual = KsoupEntities.decodeHtml("∂"))
}
@Test fun pruneMiscellaneousTechnical() {
assertEquals(expected = """⌘""", actual = KsoupEntities.decodeHtml("⌘"))
}
@Test fun pruneEnclosedAlphanumerics() {
assertEquals(expected = "⑩", actual = KsoupEntities.decodeHtml("⑩"))
}
@Test fun pruneGeometricShapes() {
assertEquals(expected = "◂", actual = KsoupEntities.decodeHtml("◂"))
}
@Test fun pruneMiscellaneousSymbols() {
assertEquals(expected = "☇", actual = KsoupEntities.decodeHtml("☇"))
}
@Test fun pruneDingbats() {
assertEquals(expected = "➼", actual = KsoupEntities.decodeHtml("➼"))
}
@Test fun pruneCjkRadicalsSupplement() {
assertEquals(expected = """⺉""", actual = KsoupEntities.decodeHtml("⺉"))
}
@Test fun pruneKangxiRadicals() {
assertEquals(expected = "⼉", actual = KsoupEntities.decodeHtml("⼉"))
}
@Test fun pruneCjkSymbolsAndPunctuation() {
assertEquals(expected = "〆", actual = KsoupEntities.decodeHtml("〆"))
}
@Test fun pruneHiragana() {
assertEquals(expected = """ゔ""", actual = KsoupEntities.decodeHtml("ゔ"))
}
@Test fun pruneKatakana() {
assertEquals(expected = "オ", actual = KsoupEntities.decodeHtml("オ"))
}
@Test fun pruneHalfWidthAndFullWidthForms() {
assertEquals(expected = """→""", actual = KsoupEntities.decodeHtml("→"))
}
@Test fun pruneSomeKindOfChinese() {
assertEquals(expected = """不""", actual = KsoupEntities.decodeHtml("不"))
}
Feel free to use 'em.
Thanks for reporting this. I'll make sure to fix it ASAP.
It would be immensely helpful if the `ksoup-entities` module provided an equivalent of Jsoup's `Parser.unescapeEntities` API: https://jsoup.org/apidocs/org/jsoup/parser/Parser.html#unescapeEntities(java.lang.String,boolean)
With that one you can unescape all of the `&#1234;`-style entities. My use case is that I have an RSS reader, and oftentimes, even though it's not required, the content is HTML-encoded and hence I need to unescape it. For instance, here's one such feed:
https://lexfridman.com/feed/podcast/
Note though that ideally any Unicode character would be supported; here are some more feeds which use a lot of escaped HTML entities:
https://vandal.elespanol.com/xml.cgi https://open.firstory.me/rss/user/cklqae6gy388f0892i2qmv851 https://blog.codinghorror.com/rss/ https://www.ivoox.com/finanzas-personales-libertad-financiera_fg_f1703990_filtro_1.xml