CharsetDetector / UTF-unknown

Character set detector built in C# - .NET 5+, .NET Core 2+, .NET Standard 1+ & .NET 4+
303 stars 45 forks source link

UTF16 LE is not detected correctly #164

Open Mosch0512 opened 9 months ago

Mosch0512 commented 9 months ago

UTF16_Test.csv

For the provided file the encoding is not detected as UTF16 LE even though it is.

detector.Encoding = ASCIIEncoding.ASCIIEncodingSealed
 BodyName = {string} "us-ascii"
 CodePage = {int} 20127
 DecoderFallback = DecoderReplacementFallback
 EncoderFallback = EncoderReplacementFallback
 EncodingName = {string} "US-ASCII"
 HeaderName = {string} "us-ascii"
 IsBrowserDisplay = {bool} false
 IsBrowserSave = {bool} false
 IsMailNewsDisplay = {bool} true
 IsMailNewsSave = {bool} true
 IsReadOnly = {bool} true
 IsSingleByte = {bool} true
 IsUTF8CodePage = {bool} false
 Preamble = {ReadOnlySpan<byte>} System.ReadOnlySpan<Byte>[0]
 WebName = {string} "us-ascii"
 WindowsCodePage = {int} 1252
 _codePage = {int} 20127
 _dataItem = CodePageDataItem
 _isReadOnly = {bool} true
 decoderFallback = DecoderReplacementFallback
 encoderFallback = EncoderReplacementFallback

As a workaround, I am now using this code:

// Workaround: check for the UTF-16 LE byte pattern by hand before asking
// CharsetDetector, because the detector reported US-ASCII for a UTF-16 LE file.
byte[] byteArray = File.ReadAllBytes(filePath);

// In UTF-16 LE, characters in the ASCII range are stored as (value, 0x00) pairs,
// so the byte at every odd index is zero. Count those zero bytes.
int zeroBytesCount = 0;
for (int i = 1; i < byteArray.Length; i += 2)
{
    if (byteArray[i] == 0)
    {
        zeroBytesCount++;
    }
}

// Fallback used when neither the heuristic nor the detector yields an encoding.
Encoding encoding = Encoding.UTF8;

// Only half of the bytes sit at odd indices, so 40% of the total length means
// roughly 80% of the odd-index bytes are zero: most likely UTF-16 LE.
// At least one zero byte is required so that an empty file (0 >= 0) is not
// classified as UTF-16.
if (zeroBytesCount > 0 && zeroBytesCount >= byteArray.Length * 0.4)
{
    encoding = Encoding.Unicode;
}
else
{
    // Detected (and its Encoding) may be null when the detector finds no match;
    // keep the UTF-8 fallback instead of throwing a NullReferenceException.
    DetectionDetail detector = CharsetDetector.DetectFromBytes(byteArray).Detected;
    encoding = detector?.Encoding ?? encoding;
}