dwhieb / Nisinoon

Website for the Algonquian Components Project (Nisinoon)
https://nisinoon.net
MIT License
1 stars 0 forks source link

Get list of used Unicode points #162

Closed dwhieb closed 3 months ago

dwhieb commented 3 months ago

This is done. There were no noticeable cases where the same character was encoded with different Unicode code points.

Here's the code used to construct the list:

import Components from './data/Components.js'

// Instantiate the project's Components collection (imported above).
const components = new Components()

// Populate the collection before anything below reads from it.
await components.load()

// Maps each distinct character (a single Unicode code point) seen so far to its
// escape notation. Declared with `let` because the script reassigns it further down.
let unicodePoints = new Map

/**
 * Records every code point of `str` in the module-level `unicodePoints` map.
 *
 * Keys are the characters themselves; values are their escape notation:
 * `\uXXXX` (at least four lowercase hex digits) for code points up to U+FFFF,
 * and `\u{X...}` for code points above U+FFFF.
 *
 * @param {string} str - The text whose characters should be recorded.
 * @returns {void}
 */
function extractUnicodePoints(str) {
  // `for...of` walks the string by code point, so a surrogate pair arrives as one `char`.
  for (const char of str) {
    // `codePointAt` (not `charCodeAt`) returns the full code point rather than
    // just the high surrogate for characters outside the Basic Multilingual Plane.
    const hex = char.codePointAt(0).toString(16)

    // More than four hex digits cannot be written unambiguously as `\uXXXX`,
    // so use the braced form for those.
    const notation = hex.length > 4 ? `\\u{${ hex }}` : `\\u${ hex.padStart(4, '0') }`

    unicodePoints.set(char, notation)
  }
}

// Feed the `form`, `PA`, and `UR` fields of every token through the extractor,
// skipping any field that is empty or missing.
for (const { tokens } of components.values()) {
  for (const token of tokens) {
    for (const text of [token.form, token.PA, token.UR]) {
      if (text) extractUnicodePoints(text)
    }
  }
}

// Order the [character, notation] pairs by character using locale-aware comparison.
const sortedEntries = [...unicodePoints.entries()]

sortedEntries.sort(([left], [right]) => left.localeCompare(right))

// A Map preserves insertion order, so rebuilding it from the sorted pairs keeps them sorted.
unicodePoints = new Map(sortedEntries)

console.log(unicodePoints)