首先，定义一个 enum class 作为检测函数的返回值：
// Result of text-encoding detection. Numbering starts at 1, so 0 is never a
// valid Encode value.
enum class Encode {
    ANSI       = 1,  // fallback when no Unicode encoding is recognized
    UNICODE_LE = 2,  // UTF-16 little-endian (BOM bytes FF FE)
    UNICODE_BE = 3,  // UTF-16 big-endian (BOM bytes FE FF)
    UTF8       = 4,  // UTF-8 with BOM (bytes EF BB BF)
    UTF8_NOBOM = 5   // UTF-8 recognized from content, without a BOM
};
// Next, the encoding can be determined using the rules summarized above.
// Detects the encoding of a text buffer.
//
// A byte-order mark at the start of the buffer is checked first:
//   FF FE    -> Encode::UNICODE_LE
//   FE FF    -> Encode::UNICODE_BE
//   EF BB BF -> Encode::UTF8
// If no BOM is present, the content is scanned by CheckUnicodeWithoutBOM();
// when that succeeds the result is Encode::UTF8_NOBOM, otherwise Encode::ANSI.
//
// pBuffer: start of the data to inspect (may be null).
// length:  number of readable bytes at pBuffer.
//
// A null or empty buffer yields Encode::ANSI. Each BOM comparison is guarded
// by a length check so buffers shorter than the BOM are never read past
// their end.
//
// NOTE: CheckUnicodeWithoutBOM must be declared before this function.
Encode DetectEncode(const PBYTE pBuffer, long length)
{
    // Nothing to inspect: no evidence of any Unicode encoding.
    if (pBuffer == nullptr || length <= 0) {
        return Encode::ANSI;
    }

    // Two-byte UTF-16 BOMs.
    if (length >= 2) {
        if (pBuffer[0] == 0xFF && pBuffer[1] == 0xFE) {
            return Encode::UNICODE_LE;
        }
        if (pBuffer[0] == 0xFE && pBuffer[1] == 0xFF) {
            return Encode::UNICODE_BE;
        }
    }

    // Three-byte UTF-8 BOM.
    if (length >= 3 &&
        pBuffer[0] == 0xEF && pBuffer[1] == 0xBB && pBuffer[2] == 0xBF) {
        return Encode::UTF8;
    }

    // No BOM: fall back to validating the content as UTF-8.
    if (CheckUnicodeWithoutBOM(pBuffer, length)) {
        return Encode::UTF8_NOBOM;
    }
    return Encode::ANSI;
}
// The implementation of the UTF-8-without-BOM check follows below.
// Checks whether a buffer looks like UTF-8 text that has no BOM.
//
// pText:  start of the data to inspect (may be null).
// length: number of readable bytes at pText.
//
// Returns TRUE only if every byte belongs to a structurally well-formed
// multi-byte sequence or is plain ASCII, AND at least one non-ASCII byte is
// present. Returns FALSE for:
//   - a null or empty buffer,
//   - pure ASCII data (indistinguishable from ANSI),
//   - a stray continuation byte (0x80..0xBF) where a lead byte is expected,
//   - a lead byte of 0xFE or 0xFF, which never occurs in UTF-8,
//   - a missing or malformed continuation byte,
//   - a sequence truncated by the end of the buffer.
//
// NOTE: only the lead/continuation byte structure is validated. The legacy
// 5- and 6-byte lead bytes (0xF8..0xFD) are still accepted, and overlong
// encodings or surrogate code points are not rejected.
BOOL CheckUnicodeWithoutBOM(const PBYTE pText, long length)
{
    if (pText == nullptr || length <= 0) {
        return FALSE;
    }

    DWORD nPending = 0;      // continuation bytes still expected for the current sequence
    BOOL bAllAscii = TRUE;   // stays TRUE until a byte with the high bit set is seen

    // The index has the same type as length so large buffers cannot overflow it.
    for (long i = 0; i < length; ++i) {
        const UCHAR chr = pText[i];

        if ((chr & 0x80) != 0) {
            bAllAscii = FALSE;
        }

        if (nPending == 0) {
            if (chr < 0x80) {
                continue;  // single-byte ASCII character
            }
            // Lead byte: derive how many continuation bytes must follow.
            if (chr >= 0xFE) {
                return FALSE;  // 0xFE / 0xFF are never valid in UTF-8
            } else if (chr >= 0xFC) {
                nPending = 5;
            } else if (chr >= 0xF8) {
                nPending = 4;
            } else if (chr >= 0xF0) {
                nPending = 3;
            } else if (chr >= 0xE0) {
                nPending = 2;
            } else if (chr >= 0xC0) {
                nPending = 1;
            } else {
                return FALSE;  // continuation byte without a preceding lead byte
            }
        } else {
            // Inside a sequence: every byte must have the form 10xxxxxx.
            if ((chr & 0xC0) != 0x80) {
                return FALSE;
            }
            --nPending;
        }
    }

    // The buffer ended in the middle of a multi-byte sequence.
    if (nPending > 0) {
        return FALSE;
    }

    // Pure ASCII gives no evidence of UTF-8, so it is not reported as such.
    if (bAllAscii) {
        return FALSE;
    }
    return TRUE;
}
// Reposted from: https://www.cnblogs.com/lkpp/p/encoding_detection.html