最近一段时间在做一些关于文字编码方面的东西,常常涉及到各种编码字符之间的转换。主要是做中日文方面的,包括中文GB2312、日文JIS、SHIFT-JIS,以及它们和Unicode码之间的转换。
一 GBK <==> Unicode
unsigned short GBK2UNI(unsigned short usGBK) { unsigned char szEUC[2] = { usGBK >> 8, usGBK & 0xFF }; unsigned short usUNI; MultiByteToWideChar( 936, 0, (LPCSTR)szEUC, 2, &usUNI, 1 ); return usUNI; }
/*
 * UNI2GBK - convert one UTF-16 code unit to a GBK character code.
 *
 * usUNI: the UTF-16 code unit to convert.
 *
 * Returns the first output byte in the high 8 bits and the second output
 * byte in the low 8 bits (the low byte stays 0 when only one byte is
 * produced), or 0 when WideCharToMultiByte reports failure.
 */
unsigned short UNI2GBK(unsigned short usUNI)
{
    unsigned char szGBK[3] = { 0 };
    unsigned short wzUNI[2] = { usUNI, 0 };

    /*
     * Convert exactly one code unit. A source length of 2 would also convert
     * the terminating NUL, so a double-byte character would need 3 output
     * bytes and the call would fail against the 2-byte limit.
     */
    if (WideCharToMultiByte(936, 0, wzUNI, 1, (LPSTR)szGBK, 2, 0, 0) <= 0) {
        return 0;
    }
    return (unsigned short)((szGBK[0] << 8) | szGBK[1]);
}
二 SHIFT-JIS <==> Unicode unsigned short SJIS2UNI(unsigned short usSJIS) { unsigned char szEUC[2] = { usSJIS >> 8, usSJIS & 0xFF }; unsigned short usUNI; MultiByteToWideChar( 932, 0, (LPCSTR)szEUC, 2, &usUNI, 1 ); return usUNI; } unsigned short UNI2SJIS(unsigned short usUNI) { unsigned char szSJIS[3] = { 0 }; unsigned short wzUNI[2] = { usUNI, 0 }; unsigned short usSJIS; WideCharToMultiByte( 932, 0, wzUNI, 2, (LPSTR)szSJIS, 2, 0, 0 ); usSJIS = (szSJIS[0] << 8) | szSJIS[1]; return usSJIS; }
三 JIS <=> Unicode
/*
 * JIS2UNI - convert one two-byte JIS character code to a UTF-16 code unit.
 *
 * usJIS: 7-bit JIS code, first byte in the high 8 bits and second byte in
 *        the low 8 bits.
 *
 * The code is turned into its EUC form by setting the top bit of both bytes
 * and is then decoded with code page 20932. Returns the first code unit
 * produced, or 0 when the conversion fails.
 */
unsigned short JIS2UNI(unsigned short usJIS)
{
    unsigned short usEUC = (unsigned short)(usJIS | 0x8080);
    unsigned char szEUC[2] = {
        (unsigned char)(usEUC >> 8),
        (unsigned char)(usEUC & 0xFF)
    };
    /* Zero-initialised so a failed call never returns an indeterminate value. */
    unsigned short wzUNI[2] = { 0, 0 };

    if (MultiByteToWideChar(20932, 0, (LPCSTR)szEUC, 2, wzUNI, 2) <= 0) {
        return 0;
    }
    return wzUNI[0];
}

/*
 * UNI2JIS - convert one UTF-16 code unit to a two-byte JIS character code.
 *
 * usUNI: the UTF-16 code unit to convert.
 *
 * Returns the 7-bit JIS code (first byte in the high 8 bits, second byte in
 * the low 8 bits), or 0 when the conversion fails or the result is not a
 * two-byte code with both bytes in 0xA1..0xFE.
 */
unsigned short UNI2JIS(unsigned short usUNI)
{
    unsigned char szEUC[3] = { 0 };
    unsigned short wzUNI[2] = { usUNI, 0 };

    /*
     * Convert exactly one code unit. A source length of 2 would also convert
     * the terminating NUL, so a double-byte character would need 3 output
     * bytes and the call would fail against the 2-byte limit.
     */
    if (WideCharToMultiByte(20932, 0, wzUNI, 1, (LPSTR)szEUC, 2, 0, 0) != 2) {
        return 0;
    }

    /*
     * Only accept the EUC form that JIS2UNI feeds in (both bytes with the
     * top bit set).
     * NOTE(review): code page 20932 is reported to emit other two-byte forms
     * as well (0x8E-led half-width katakana, JIS X 0212 with a low second
     * byte); those have no 7-bit JIS X 0208 code - confirm against the code
     * page table.
     */
    if (szEUC[0] < 0xA1 || szEUC[0] > 0xFE ||
        szEUC[1] < 0xA1 || szEUC[1] > 0xFE) {
        return 0;
    }

    /* Undo the 0x8080 offset that JIS2UNI applies, yielding the JIS code. */
    return (unsigned short)(((szEUC[0] << 8) | szEUC[1]) & 0x7F7F);
}
/* 4. JIS <=> SHIFT-JIS */

/*
 * SJIS2JIS - convert a two-byte Shift-JIS code to the corresponding JIS code.
 *
 * sjis: Shift-JIS code, lead byte in the high 8 bits, trail byte in the low
 *       8 bits.
 *
 * Returns the JIS code (first byte in the high 8 bits), or 0 when sjis lies
 * outside 0x8140..0x9ffc and 0xe040..0xeffc or its trail byte is not in
 * 0x40..0x7e / 0x80..0xfc.
 */
unsigned short SJIS2JIS( unsigned short sjis )
{
    const int in_first_range  = (sjis >= 0x8140) && (sjis <= 0x9ffc);
    const int in_second_range = (sjis >= 0xe040) && (sjis <= 0xeffc);
    unsigned int lead;
    unsigned int trail;
    unsigned int first;

    if ( !in_first_range && !in_second_range )
        return 0;

    lead  = sjis >> 8;
    trail = sjis & 0x00ff;

    if ( trail < 0x40 || trail == 0x7f || trail > 0xfc )
        return 0;

    /* Collapse the two lead-byte ranges into one run, then two rows per lead. */
    first = ((lead - (lead >= 0xe0 ? 0xc0 : 0x80)) << 1) + 0x1f;

    if ( trail >= 0x9f )
    {
        /* Upper half of the trail range maps to the following row. */
        first += 1;
        trail -= 0x7e;
    }
    else
    {
        /* Trail bytes above the 0x7f gap are shifted by one extra position. */
        trail -= (trail >= 0x80) ? 0x20 : 0x1f;
    }

    return (unsigned short)((first << 8) + trail);
}
/*
 * JIS2SJIS - convert a two-byte JIS code to the corresponding Shift-JIS code.
 *
 * jis: JIS code, first byte in the high 8 bits and second byte in the low
 *      8 bits; both bytes must be in 0x21..0x7e.
 *
 * Returns the Shift-JIS code (lead byte in the high 8 bits), or 0 when
 * either byte of jis is outside 0x21..0x7e.
 */
unsigned short JIS2SJIS( unsigned short jis )
{
    unsigned int row  = jis >> 8;
    unsigned int cell = jis & 0x00ffu;
    unsigned int lead;
    unsigned int trail;

    /*
     * Validate both bytes up front. Without the upper bound on the low byte,
     * an out-of-range value yields an invalid trail byte or carries into the
     * lead byte instead of producing 0.
     */
    if ( row < 0x21 || row > 0x7e || cell < 0x21 || cell > 0x7e )
        return 0;

    if ( row & 0x0001u )
    {
        /* Odd rows use trail bytes 0x40..0x9e, skipping 0x7f. */
        trail = cell + 0x1f;
        if ( trail >= 0x7f )
            trail++;
    }
    else
    {
        /* Even rows use trail bytes 0x9f..0xfc. */
        trail = cell + 0x7e;
    }

    /* Two rows share one lead byte; leads jump from 0x9f to 0xe0. */
    lead = ((row - 0x21) >> 1) + 0x81;
    if ( lead >= 0xa0 )
        lead += 0x40;

    return (unsigned short)((lead << 8) | trail);
}
UNI2JIS这个函数好像不太好用,其他的都经过测试,没有问题。目前我还不知道具体原因,所以现在从Unicode转到JIS是分两个步骤进行的:第一步先将Unicode转到SHIFT-JIS,第二步再由SHIFT-JIS转到JIS。如果哪位朋友知道是什么原因,欢迎在我的博客里指点指点。
好了,就这些,希望能给需要的朋友们带来一点方便。
转载于:https://www.cnblogs.com/antony1029/archive/2008/09/20/1294946.html
相关资源:Unicode编码表_多国语言