Public Shared Function GuessEncoding(s As String) as TextEncoding
  // Guess which text encoding the bytes of the given string are in.
  // The encoding currently set on the string is ignored; only the raw
  // bytes are inspected. The result is one of:
  //
  // * UTF-32 (native, BE or LE)  - only when a UTF-32 BOM is present
  // * UTF-16 (native, BE or LE)  - BOM, or NUL bytes in every other position
  // * UTF-8                      - BOM, or all bytes form legal UTF-8 sequences
  // * Encodings.ASCII            - 7-bit data that does not look like UTF-16
  // * Encodings.SystemDefault    - fallback when nothing else matches
  //
  // Note: isBigEndian below describes the machine running this code, not the
  // data. When the data's byte order equals the machine's, the native
  // encoding (Encodings.UTF16 / UTF32) is returned; otherwise the explicit
  // BE/LE variant is returned.
  //
  // Written by Joe Strout, slightly modernized by Thomas Tempelmann
  
  #pragma DisableBackgroundTasks
  #pragma DisableBoundsChecking
  
  // Determine the machine's byte order once by looking at how the native
  // UTF-16 encoding lays out U+FEFF in memory.
  static isBigEndian, endianChecked As Boolean
  if not endianChecked then
    Dim temp As String = Encodings.UTF16.Chr( &hFEFF )
    isBigEndian = (AscB( MidB( temp, 1, 1 ) ) = &hFE)
    endianChecked = true
  end if
  
  // check for a BOM
  Dim b0 As Integer = AscB( s.MidB( 1, 1 ) )
  Dim b1 As Integer = AscB( s.MidB( 2, 1 ) )
  Dim b2 As Integer = AscB( s.MidB( 3, 1 ) )
  Dim b3 As Integer = AscB( s.MidB( 4, 1 ) )
  
  if b0=0 and b1=0 and b2=&hFE and b3=&hFF then
    // UTF-32, big-endian
    if isBigEndian then
      #if RBVersion >= 2012.02
        return Encodings.UTF32
      #else
        return Encodings.UCS4
      #endif
    else
      return Encodings.UTF32BE
    end if
  elseif b0=&hFF and b1=&hFE and b2=0 and b3=0 and s.LenB >= 4 then
    // UTF-32, little-endian. The length test tells real zero bytes apart
    // from the zeros that come from reading past the end of a short
    // string, which would otherwise hide a plain UTF-16 LE BOM.
    if isBigEndian then
      return Encodings.UTF32LE
    else
      #if RBVersion >= 2012.02
        return Encodings.UTF32
      #else
        return Encodings.UCS4
      #endif
    end if
  elseif b0=&hFE and b1=&hFF then
    // UTF-16, big-endian
    if isBigEndian then
      return Encodings.UTF16
    else
      return Encodings.UTF16BE
    end if
  elseif b0=&hFF and b1=&hFE then
    // UTF-16, little-endian
    if isBigEndian then
      return Encodings.UTF16LE
    else
      return Encodings.UTF16
    end if
  elseif b0=&hEF and b1=&hBB and b2=&hBF then
    // UTF-8 (ah, a sensible encoding where endianness doesn't matter!)
    // Fixed: the third BOM byte is b2; the old code tested b1 twice, so
    // this branch could never be taken.
    return Encodings.UTF8
  end if
  
  // no BOM; see whether the data is entirely 7-bit, and whether it
  // contains NUL bytes.
  Dim m As MemoryBlock = s
  Dim maxi As Integer = s.LenB - 1
  Dim i As Integer
  Dim sawHighByte, sawNull As Boolean
  for i = 0 to maxi
    if m.Byte(i) > 127 then
      sawHighByte = true
      exit for
    elseif m.Byte(i) = 0 then
      sawNull = true
    end if
  next
  
  // Plain 7-bit data is ASCII. 7-bit data that contains NULs and has an
  // even byte count might instead be BOM-less UTF-16 holding only
  // ASCII-range characters, so let it reach the null heuristic below
  // instead of returning ASCII straight away.
  if not sawHighByte and (not sawNull or (s.LenB mod 2) <> 0) then
    return Encodings.ASCII
  end if
  
  // Check for a high incidence of nulls every other byte,
  // which suggests UTF-16 (at least in Roman text).
  Dim nulls(1) As Integer // null count in even (0) and odd (1) bytes
  for i = 0 to maxi
    if m.Byte(i) = 0 then
      nulls(i mod 2) = nulls(i mod 2) + 1
    end if
  next
  
  if nulls(0) > nulls(1)*2 and nulls(0) > maxi\2 then
    // NULs at even offsets = high byte first: UTF-16, big-endian
    if isBigEndian then
      return Encodings.UTF16
    else
      return Encodings.UTF16BE
    end if
  elseif nulls(1) > nulls(0)*2 and nulls(1) > maxi\2 then
    // NULs at odd offsets = low byte first: UTF-16, little-endian
    if isBigEndian then
      return Encodings.UTF16LE
    else
      return Encodings.UTF16
    end if
  end if
  
  // 7-bit data with NULs that does not look like UTF-16: report ASCII,
  // as before.
  if not sawHighByte then return Encodings.ASCII
  
  // it's not ASCII; check for illegal UTF-8 characters.
  // See Table 3.1B, "Legal UTF-8 Byte Sequences",
  // at <http://unicode.org/versions/corrigendum1.html>
  // Each case checks the trail bytes that must follow the lead byte and
  // then advances i past them; any violation (including a sequence cut off
  // by the end of the data) leaves the loop early with i <= maxi.
  Dim b As Byte
  for i = 0 to maxi
    select case m.Byte(i)
    case &h00 to &h7F
      // single-byte character; just continue
    case &hC2 to &hDF
      // one additional byte
      if i+1 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      i = i+1
    case &hE0
      // two additional bytes; first trail byte restricted to A0..BF
      if i+2 > maxi then exit for
      b = m.Byte(i+1)
      if b < &hA0 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      i = i+2
    case &hE1 to &hEF
      // two additional bytes
      if i+2 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      i = i+2
    case &hF0
      // three additional bytes; first trail byte restricted to 90..BF
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h90 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    case &hF1 to &hF3
      // three additional bytes
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    case &hF4
      // three additional bytes; first trail byte restricted to 80..8F
      if i+3 > maxi then exit for
      b = m.Byte(i+1)
      if b < &h80 or b > &h8F then exit for
      b = m.Byte(i+2)
      if b < &h80 or b > &hBF then exit for
      b = m.Byte(i+3)
      if b < &h80 or b > &hBF then exit for
      i = i+3
    else
      // C0, C1, F5..FF or a stray trail byte: never legal as a lead byte
      exit for
    end select
  next i
  
  // The loop ran to completion only if every sequence was legal.
  if i > maxi then return Encodings.UTF8 // no illegal UTF-8 sequences, so that's probably what it is
  
  // If not valid UTF-8, then let's just guess the system default.
  return Encodings.SystemDefault
End Function
1 Like
Is this correct? You check for BigEndian and then return a LittleEndian encoding.
I have to admit I know next to nothing about text encodings and I don’t fully understand your code but that seems wrong to me.
Julen
Well, Joe Strout and Thomas must have thought it was correct… I didn’t write it… but it seems to work.
I see now my mistake, isBigEndian does not refer to the string being analysed but the system.
Julen
Thanks for sharing; it’ll easily be a useful piece of code.