Shapefile属性编码自动识别(C#、Java、C)

mac2025-04-27  5

Shapefile 是一种常用的矢量数据格式,保存了地理数据的坐标和属性信息。

ArcGIS 10.2.1之前,Shapefile的编码默认为本地编码,国内默认为GBK,这导致在导入国外提供的shp数据时会出现乱码。

Shapefile 在 .dbf 和 .cpg 中有存储编码页信息,可以通过解析这两个文件自动识别编码页,以修正乱码问题。

Shapefile构成

Shapefile 由多个文件构成,最基本的是 .shp 、.shx、.dbf 文件。

.shp:存储地理数据的坐标信息。.shx:存储地理数据的位置索引,记录每个地理数据在shp文件中的位置,能够快速定位数据。.dbf:存储地理数据的属性信息,以dBase IV的数据表格式存储。

可选的文件如下:

.shp.xml:以xml格式保存元数据。.prj:存储地理坐标系统和投影信息。.cpg:指定.dbf文件的字符编码。.sbn、.sbx:空间索引文件。.ixs:地理编码索引。.mxs:地理编码索引(ODB格式)。.atx:.dbf文件的属性索引。

dbf编码自动识别

.dbf 文件头结构如下: .dbf 文件头中偏移量为 29 的字节(从 0 开始计数,即第 30 个字节)表示 Language Driver ID(LDID),其代表的编码页参照以下链接:

http://shapelib.maptools.org/codepage.html?d=1563413688103

节选部分如下:

.cpg中明文存储文件编码,例如:windows-1251。

解析编码页时,优先使用 .dbf 中的 Language Driver ID;若该值未指明具体编码页(即 0x00 或 0x57),则取 .cpg 文件中的编码。

c# 源码:

/// <summary>
/// Resolves the text encoding of a .dbf attribute table from its Language Driver ID (LDID).
/// LDID values 0x00 and 0x57 do not name a concrete code page, so for those the encoding is
/// read from the sibling .cpg file instead. Every other known LDID maps to a fixed code page.
/// Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
/// </summary>
/// <param name="languageDriverID">Language Driver ID byte taken from the .dbf header.</param>
/// <returns>
/// The matching encoding. Falls back to UTF-8 for unknown IDs and whenever the requested
/// code page cannot be created; this method never throws.
/// </returns>
private System.Text.Encoding GetEncoding(byte languageDriverID)
{
    try
    {
        switch (languageDriverID)
        {
            case 0x00:
            case 0x57:
                // 0x00: use the .cpg encoding, otherwise UTF-8.
                // 0x57: use the .cpg encoding, otherwise Encoding.Default.
                return ReadCpgEncodingOrFallback(languageDriverID);

            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return Encoding.GetEncoding(437); // IBM437

            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return Encoding.GetEncoding(850); // ibm850

            case 0x03:
            case 0x58:
            case 0x59:
                return Encoding.GetEncoding(1252); // Windows-1252

            case 0x04:
                return Encoding.GetEncoding(10000); // macintosh

            case 0x08:
            case 0x17:
            case 0x66:
                return Encoding.GetEncoding(865); // IBM865

            case 0x13:
            case 0x7B:
                return Encoding.GetEncoding(932); // shift_jis (Japanese)

            case 0x1C:
            case 0x6C:
                return Encoding.GetEncoding(863); // IBM863

            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case 0x87:
                return Encoding.GetEncoding(852); // ibm852

            case 0x24:
                return Encoding.GetEncoding(860); // IBM860

            case 0x26:
            case 0x65:
                return Encoding.GetEncoding(866); // cp866

            case 0x4D:
            case 0x7A:
                return Encoding.GetEncoding(936); // gb2312 (code page 936)

            case 0x4E:
            case 0x79:
                return Encoding.GetEncoding(949); // ks_c_5601-1987

            case 0x4F:
            case 0x78:
                return Encoding.GetEncoding(950); // big5

            case 0x50:
            case 0x7C:
                return Encoding.GetEncoding(874); // windows-874

            case 0x67:
                return Encoding.GetEncoding(861); // ibm861

            case 0x68:
                // Code page 895: the original author notes it does not exist on the system,
                // so this normally throws and the outer catch returns UTF-8.
                return Encoding.GetEncoding(895);

            case 0x69:
                // Code page 620: same situation as 895 above.
                return Encoding.GetEncoding(620);

            case 0x6A:
            case 0x86:
                return Encoding.GetEncoding(737); // ibm737

            case 0x6B:
            case 0x88:
                return Encoding.GetEncoding(857); // ibm857

            case 0x96:
                return Encoding.GetEncoding(10007); // x-mac-cyrillic

            case 0x97:
                return Encoding.GetEncoding(10029); // x-mac-ce

            case 0x98:
                return Encoding.GetEncoding(10006); // x-mac-greek

            case 0xC8:
                return Encoding.GetEncoding(1250); // windows-1250

            case 0xC9:
                return Encoding.GetEncoding(1251); // windows-1251

            case 0xCA:
                return Encoding.GetEncoding(1254); // windows-1254

            case 0xCB:
                return Encoding.GetEncoding(1253); // windows-1253

            case 0xCC:
                return Encoding.GetEncoding(1257); // windows-1257

            default:
                return Encoding.UTF8;
        }
    }
    catch (Exception)
    {
        // Deliberate best effort: an unavailable code page (or a bad path) must not
        // abort reading the attribute table, so decode as UTF-8 instead.
        return Encoding.UTF8;
    }
}

/// <summary>
/// Returns the encoding used when the .cpg file is missing, empty or unreadable:
/// UTF-8 for LDID 0x00, <see cref="Encoding.Default"/> for any other ID (0x57 in practice).
/// </summary>
private static Encoding GetLdidFallbackEncoding(byte languageDriverID)
{
    return languageDriverID == 0x00 ? Encoding.UTF8 : Encoding.Default;
}

/// <summary>
/// Reads the encoding named on the first line of the .cpg file that sits next to
/// <c>_fileName</c>, or returns the LDID-specific fallback when that is not possible.
/// </summary>
private Encoding ReadCpgEncodingOrFallback(byte languageDriverID)
{
    // Path errors are intentionally left to propagate to the caller's catch (UTF-8).
    string cpgPath = Path.Combine(
        Path.GetDirectoryName(_fileName),
        Path.GetFileNameWithoutExtension(_fileName) + ".cpg");

    if (!File.Exists(cpgPath))
    {
        return GetLdidFallbackEncoding(languageDriverID);
    }

    try
    {
        using (StreamReader reader = new StreamReader(cpgPath))
        {
            // Trim so stray spaces or a trailing '\r' do not break the name lookup.
            string name = (reader.ReadLine() ?? string.Empty).Trim();
            if (name.Length == 0)
            {
                return GetLdidFallbackEncoding(languageDriverID);
            }

            // A .cpg may hold a bare code page number (e.g. "936"); resolve that
            // numerically instead of relying on the name lookup.
            int codePage;
            if (int.TryParse(
                    name,
                    System.Globalization.NumberStyles.None,
                    System.Globalization.CultureInfo.InvariantCulture,
                    out codePage))
            {
                return Encoding.GetEncoding(codePage);
            }

            return Encoding.GetEncoding(name);
        }
    }
    catch (Exception)
    {
        // Unreadable file or unknown encoding name: best-effort fallback.
        return GetLdidFallbackEncoding(languageDriverID);
    }
}

Java 源码:

/**
 * Resolves the charset name of a .dbf attribute table from its Language Driver ID (LDID).
 * LDID values 0x00 and 0x57 do not name a concrete code page, so for those the charset is
 * read from the sibling .cpg file, defaulting to UTF-8. Every other known LDID maps to a
 * fixed charset name.
 * Code page table: http://shapelib.maptools.org/codepage.html?d=1563413688103
 *
 * @param languageDriverID Language Driver ID byte taken from the .dbf header
 * @return the charset name; {@code ExportParameters.ENCODING_UTF8} for unknown IDs or on any error
 */
private String GetEncoding(byte languageDriverID) {
    try {
        switch (languageDriverID) {
            case 0x00:
            case 0x57:
                // Both IDs defer to the .cpg file; without one the result is UTF-8.
                return readCpgEncodingOrUtf8();

            case 0x01: case 0x09: case 0x0B: case 0x0D: case 0x0F:
            case 0x11: case 0x15: case 0x18: case 0x19: case 0x1B:
                return "IBM437";

            case 0x02: case 0x0A: case 0x0E: case 0x10: case 0x12: case 0x14:
            case 0x16: case 0x1A: case 0x1D: case 0x25: case 0x37:
                return "ibm850";

            case 0x03: case 0x58: case 0x59:
                return "Windows-1252";

            case 0x04:
                return "macintosh";

            case 0x08: case 0x17: case 0x66:
                return "IBM865";

            case 0x13: case 0x7B:
                // Code page 932 is Shift-JIS; ISO-2022-JP is a different byte encoding.
                return "Shift_JIS";

            case 0x1C: case 0x6C:
                return "IBM863";

            case 0x1F: case 0x22: case 0x23: case 0x40: case 0x64:
            case (byte) 0x87:
                return "ibm852";

            case 0x24:
                return "IBM860";

            case 0x26: case 0x65:
                return "cp866";

            case 0x4D: case 0x7A:
                // Code page 936 is GBK, a superset of GB2312.
                return "GBK";

            case 0x4E: case 0x79:
                return "ks_c_5601-1987";

            case 0x4F: case 0x78:
                return "big5";

            case 0x50: case 0x7C:
                return "windows-874";

            case 0x67:
                return "ibm861";

            case 0x68:
                // Code page 895: no charset available, decode as UTF-8.
                return ExportParameters.ENCODING_UTF8;

            case 0x69:
                // Code page 620: no charset available, decode as UTF-8.
                return ExportParameters.ENCODING_UTF8;

            case 0x6A:
            case (byte) 0x86:
                return "ibm737";

            case 0x6B:
            case (byte) 0x88:
                return "ibm857";

            case (byte) 0x96:
                return "x-mac-cyrillic";

            case (byte) 0x97:
                return "x-mac-ce";

            case (byte) 0x98:
                return "x-mac-greek";

            case (byte) 0xC8:
                return "windows-1250";

            case (byte) 0xC9:
                return "windows-1251";

            case (byte) 0xCA:
                return "windows-1254";

            case (byte) 0xCB:
                return "windows-1253";

            case (byte) 0xCC:
                return "windows-1257";

            default:
                return ExportParameters.ENCODING_UTF8;
        }
    } catch (Exception ex) {
        // Deliberate best effort: never let encoding detection abort the read.
        return ExportParameters.ENCODING_UTF8;
    }
}

/**
 * Reads the charset name from the first line of the .cpg file next to {@code _fileName}.
 * The .cpg path is built by replacing the last three characters of {@code _fileName}
 * with "cpg".
 *
 * @return the trimmed first line, or UTF-8 when the file is missing, blank or unreadable
 */
@SuppressWarnings("deprecation")
private String readCpgEncodingOrUtf8() {
    File cpgPath = new File(_fileName.substring(0, _fileName.length() - 3) + "cpg");
    if (!cpgPath.exists()) {
        return ExportParameters.ENCODING_UTF8;
    }

    DataInputStream ds = null;
    try {
        ds = new DataInputStream(new FileInputStream(cpgPath));
        // readLine() maps bytes straight to chars, which is adequate for the
        // ASCII charset names a .cpg is expected to contain.
        String txt = ds.readLine();
        String name = (txt == null) ? "" : txt.trim();
        // Check after trimming so a whitespace-only line does not yield "".
        if (name.length() == 0) {
            return ExportParameters.ENCODING_UTF8;
        }
        return name;
    } catch (Exception ex) {
        return ExportParameters.ENCODING_UTF8;
    } finally {
        // Always release the file handle; the original never closed the stream.
        if (ds != null) {
            try {
                ds.close();
            } catch (Exception ignored) {
                // Nothing useful can be done if close fails.
            }
        }
    }
}

C 源码

/*
 * Resolve the encoding name of a .dbf attribute table from its Language
 * Driver ID (LDID) and copy it into `result` as a NUL-terminated string.
 *
 * encodingCode  LDID byte taken from the .dbf header.
 * result        Output buffer, filled with strcpy. It must hold at least
 *               20 bytes: the .cpg scratch buffer is 20 bytes and the longest
 *               literal ("MacCentralEurope") needs 17. NULL is ignored.
 * cpgPath       Path of the sibling .cpg file, passed through to
 *               getCpgEncode(); only used for LDID 0x00 and 0x57.
 *
 * LDID 0x00: encoding from the .cpg file, otherwise UTF-8.
 * LDID 0x57: encoding from the .cpg file, otherwise whatever
 *            getSystemEncode() reports (presumably the platform default;
 *            verify against its definition).
 * Unknown LDIDs, and code pages 895/620 which have no usable name, give UTF-8.
 */
void getEncoding(unsigned char encodingCode, char *result, const char *cpgPath)
{
    char encode[20] = { '\0' };     /* scratch for the .cpg / system encoding name */
    const char *name = "UTF-8";     /* default for unknown or unsupported IDs */

    if (result == NULL) {
        return;
    }

    switch (encodingCode) {
    case 0x00: case 0x57:
        /* getCpgEncode() is treated as successful only when it returns 1. */
        if (getCpgEncode(cpgPath, encode) == 1) {
            name = encode;
        } else if (encodingCode != 0x00) {
            getSystemEncode(encode);
            name = encode;
        }
        /* 0x00 without a usable .cpg keeps the UTF-8 default. */
        break;

    case 0x01: case 0x09: case 0x0B: case 0x0D: case 0x0F:
    case 0x11: case 0x15: case 0x18: case 0x19: case 0x1B:
        name = "CP437";
        break;

    case 0x02: case 0x0A: case 0x0E: case 0x10: case 0x12: case 0x14:
    case 0x16: case 0x1A: case 0x1D: case 0x25: case 0x37:
        name = "CP850";
        break;

    case 0x03: case 0x58: case 0x59:
        name = "CP1252";
        break;

    case 0x04:
        name = "Macintosh";
        break;

    case 0x08: case 0x17: case 0x66:
        name = "CP865";             /* IBM865 */
        break;

    case 0x13: case 0x7B:
        /* Code page 932 is Shift-JIS; ISO-2022-JP is a different byte encoding. */
        name = "CP932";
        break;

    case 0x1C: case 0x6C:
        name = "CP863";             /* IBM863 */
        break;

    case 0x1F: case 0x22: case 0x23: case 0x40: case 0x64: case 0x87:
        name = "CP852";             /* ibm852 */
        break;

    case 0x24:
        name = "CP860";
        break;

    case 0x26: case 0x65:
        name = "CP866";             /* cp866 */
        break;

    case 0x4D: case 0x7A:
        name = "GB18030";           /* code page 936; GB18030 is a superset */
        break;

    case 0x4E: case 0x79:
        /* Code page 949 is not ISO-2022-KR, which is a different byte encoding. */
        name = "CP949";
        break;

    case 0x4F: case 0x78:
        name = "BIG5";              /* big5 */
        break;

    case 0x50: case 0x7C:
        name = "CP874";             /* windows-874 */
        break;

    case 0x67:
        name = "CP861";             /* ibm861 */
        break;

    case 0x68:                      /* code page 895: not available */
    case 0x69:                      /* code page 620: not available */
        name = "UTF-8";
        break;

    case 0x6A: case 0x86:
        name = "CP737";             /* ibm737 */
        break;

    case 0x6B: case 0x88:
        name = "CP857";             /* ibm857 */
        break;

    case 0x96:
        name = "MacCyrillic";       /* x-mac-cyrillic */
        break;

    case 0x97:
        name = "MacCentralEurope";  /* x-mac-ce */
        break;

    case 0x98:
        name = "MacGreek";          /* x-mac-greek */
        break;

    case 0xC8:
        name = "CP1250";            /* windows-1250 */
        break;

    case 0xC9:
        name = "CP1251";            /* windows-1251 */
        break;

    case 0xCA:
        name = "CP1254";            /* windows-1254 */
        break;

    case 0xCB:
        name = "CP1253";            /* windows-1253 */
        break;

    case 0xCC:
        name = "CP1257";            /* windows-1257 */
        break;

    default:
        break;
    }

    strcpy(result, name);
}

参考资料:

Shape文件的解析
shapefile与字符集编码设置
最新回复(0)