Shapefile 是一种常用的矢量数据格式,保存了地理数据的坐标和属性信息。
ArcGIS 10.2.1之前,Shapefile的编码默认为本地编码,国内默认为GBK,这导致在导入国外提供的shp数据时会出现乱码。
Shapefile 在 .dbf 和 .cpg 中有存储编码页信息,可以通过解析这两个文件自动识别编码页,以修正乱码问题。
Shapefile 构成
Shapefile 由多个文件构成,最基本的是 .shp 、.shx、.dbf 文件。
.shp:存储地理数据的坐标信息。.shx:存储地理数据的位置索引,记录每个地理数据在shp文件中的位置,能够快速定位数据。.dbf:存储地理数据的属性信息,以dBase IV的数据表格式存储。
可选的文件如下:
.shp.xml:以xml格式保存元数据。.prj:存储地理坐标系统和投影信息。.cpg:指定.dbf文件的字符编码。.sbn、.sbx:空间索引文件。.ixs:地理编码索引。.mxs:地理编码索引(ODB格式)。.atx:.dbf文件的属性索引。
dbf编码自动识别
.dbf 文件头结构如下: .dbf 文件头中第29个字节(从0开始)表示Language driver ID,其代表的编码页参照以下链接:
http://shapelib.maptools.org/codepage.html?d=1563413688103
节选部分如下:
.cpg中明文存储文件编码,例如:windows-1251。
解析编码页时,优先使用 .dbf 中的 Language Driver ID;若该值为 0x00 或 0x57(下方代码中这两个值不直接对应某个编码页),则改为读取 .cpg 文件中的编码。
C# 源码:
/// <summary>
/// Maps the Language Driver ID (LDID) read from the .dbf header to the
/// <see cref="System.Text.Encoding"/> used to decode the attribute table.
/// </summary>
/// <param name="languageDriverID">The LDID byte taken from the .dbf file header.</param>
/// <returns>
/// The encoding for the given LDID. For 0x00 and 0x57 the encoding is read from the first
/// line of the sibling .cpg file (same directory and base name as <c>_fileName</c>); when
/// that file is missing, empty or unusable the result is UTF-8 for 0x00 and
/// <see cref="System.Text.Encoding.Default"/> for 0x57. Unknown IDs, and any failure while
/// resolving a code page, yield UTF-8.
/// </returns>
private System.Text.Encoding GetEncoding(byte languageDriverID)
{
    try
    {
        switch (languageDriverID)
        {
            case 0x00:
            case 0x57:
            {
                // Single definition of the fallback used by every failure path below.
                Encoding fallback = languageDriverID == 0x00 ? Encoding.UTF8 : Encoding.Default;

                string cpgPath = Path.Combine(
                    Path.GetDirectoryName(_fileName),
                    Path.GetFileNameWithoutExtension(_fileName) + ".cpg");
                if (!File.Exists(cpgPath))
                {
                    return fallback;
                }

                try
                {
                    using (StreamReader sr = new StreamReader(cpgPath))
                    {
                        string txt = sr.ReadLine();
                        if (string.IsNullOrWhiteSpace(txt))
                        {
                            return fallback;
                        }

                        // Stray whitespace would make the name lookup fail.
                        txt = txt.Trim();

                        // A .cpg may hold a bare code page number (e.g. "936"), which the
                        // name-based lookup does not generally accept; resolve it by number.
                        int codePage;
                        if (int.TryParse(
                                txt,
                                System.Globalization.NumberStyles.None,
                                System.Globalization.CultureInfo.InvariantCulture,
                                out codePage))
                        {
                            return Encoding.GetEncoding(codePage);
                        }

                        return Encoding.GetEncoding(txt);
                    }
                }
                catch (Exception)
                {
                    // Unreadable file or unsupported encoding name/number: best-effort fallback.
                    return fallback;
                }
            }
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return Encoding.GetEncoding(437);
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return Encoding.GetEncoding(850);
            case 0x03:
            case 0x58:
            case 0x59:
                return Encoding.GetEncoding(1252);
            case 0x04:
                return Encoding.GetEncoding(10000);
            case 0x08:
            case 0x17:
            case 0x66:
                return Encoding.GetEncoding(865);
            case 0x13:
            case 0x7B:
                return Encoding.GetEncoding(932);
            case 0x1C:
            case 0x6C:
                return Encoding.GetEncoding(863);
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case 0x87:
                return Encoding.GetEncoding(852);
            case 0x24:
                return Encoding.GetEncoding(860);
            case 0x26:
            case 0x65:
                return Encoding.GetEncoding(866);
            case 0x4D:
            case 0x7A:
                return Encoding.GetEncoding(936);
            case 0x4E:
            case 0x79:
                return Encoding.GetEncoding(949);
            case 0x4F:
            case 0x78:
                return Encoding.GetEncoding(950);
            case 0x50:
            case 0x7C:
                return Encoding.GetEncoding(874);
            case 0x67:
                return Encoding.GetEncoding(861);
            case 0x68:
                return Encoding.GetEncoding(895);
            case 0x69:
                return Encoding.GetEncoding(620);
            case 0x6A:
            case 0x86:
                return Encoding.GetEncoding(737);
            case 0x6B:
            case 0x88:
                return Encoding.GetEncoding(857);
            case 0x96:
                return Encoding.GetEncoding(10007);
            case 0x97:
                return Encoding.GetEncoding(10029);
            case 0x98:
                return Encoding.GetEncoding(10006);
            case 0xC8:
                return Encoding.GetEncoding(1250);
            case 0xC9:
                return Encoding.GetEncoding(1251);
            case 0xCA:
                return Encoding.GetEncoding(1254);
            case 0xCB:
                return Encoding.GetEncoding(1253);
            case 0xCC:
                return Encoding.GetEncoding(1257);
            default:
                return Encoding.UTF8;
        }
    }
    catch (Exception)
    {
        // Code pages the runtime does not provide make GetEncoding throw; fall back to UTF-8.
        return Encoding.UTF8;
    }
}
Java 源码:
/**
 * Maps the Language Driver ID (LDID) read from the .dbf header to a charset name.
 *
 * <p>For 0x00 and 0x57 the name is taken from the first line of the sibling .cpg file,
 * whose path is derived by replacing the last three characters of {@code _fileName}
 * with "cpg". If that file is missing, blank or unreadable, UTF-8 is returned.
 * Unknown IDs and any unexpected failure also yield UTF-8.
 *
 * @param languageDriverID the LDID byte taken from the .dbf file header
 * @return the charset name for the given LDID, or {@code ExportParameters.ENCODING_UTF8}
 *         as the fallback
 */
private String GetEncoding(byte languageDriverID) {
    try {
        switch (languageDriverID) {
            case 0x00:
            case 0x57: {
                File cpgPath = new File(_fileName.substring(0, _fileName.length() - 3) + "cpg");
                if (!cpgPath.exists()) {
                    return ExportParameters.ENCODING_UTF8;
                }
                DataInputStream ds = null;
                try {
                    ds = new DataInputStream(new FileInputStream(cpgPath));
                    String txt = ds.readLine();
                    // Trim before the emptiness check so a whitespace-only line
                    // falls back to UTF-8 instead of returning "".
                    if (txt != null) {
                        txt = txt.trim();
                    }
                    if (TextUtils.isEmpty(txt)) {
                        return ExportParameters.ENCODING_UTF8;
                    }
                    return txt;
                } catch (Exception ex) {
                    return ExportParameters.ENCODING_UTF8;
                } finally {
                    // Always release the file handle; a failure while closing must not
                    // replace the value already chosen above.
                    if (ds != null) {
                        try {
                            ds.close();
                        } catch (Exception ignored) {
                            // best-effort close
                        }
                    }
                }
            }
            case 0x01:
            case 0x09:
            case 0x0B:
            case 0x0D:
            case 0x0F:
            case 0x11:
            case 0x15:
            case 0x18:
            case 0x19:
            case 0x1B:
                return "IBM437";
            case 0x02:
            case 0x0A:
            case 0x0E:
            case 0x10:
            case 0x12:
            case 0x14:
            case 0x16:
            case 0x1A:
            case 0x1D:
            case 0x25:
            case 0x37:
                return "ibm850";
            case 0x03:
            case 0x58:
            case 0x59:
                return "Windows-1252";
            case 0x04:
                return "macintosh";
            case 0x08:
            case 0x17:
            case 0x66:
                return "IBM865";
            case 0x13:
            case 0x7B:
                // Code page 932 is Shift-JIS; ISO-2022-JP is a different, 7-bit stateful encoding.
                return "Shift_JIS";
            case 0x1C:
            case 0x6C:
                return "IBM863";
            case 0x1F:
            case 0x22:
            case 0x23:
            case 0x40:
            case 0x64:
            case (byte) 0x87:
                return "ibm852";
            case 0x24:
                return "IBM860";
            case 0x26:
            case 0x65:
                return "cp866";
            case 0x4D:
            case 0x7A:
                // Code page 936 is GBK, a superset of GB2312.
                return "GBK";
            case 0x4E:
            case 0x79:
                return "ks_c_5601-1987";
            case 0x4F:
            case 0x78:
                return "big5";
            case 0x50:
            case 0x7C:
                return "windows-874";
            case 0x67:
                return "ibm861";
            case 0x68:
                return ExportParameters.ENCODING_UTF8;
            case 0x69:
                return ExportParameters.ENCODING_UTF8;
            case 0x6A:
            case (byte) 0x86:
                return "ibm737";
            case 0x6B:
            case (byte) 0x88:
                return "ibm857";
            case (byte) 0x96:
                return "x-mac-cyrillic";
            case (byte) 0x97:
                return "x-mac-ce";
            case (byte) 0x98:
                return "x-mac-greek";
            case (byte) 0xC8:
                return "windows-1250";
            case (byte) 0xC9:
                return "windows-1251";
            case (byte) 0xCA:
                return "windows-1254";
            case (byte) 0xCB:
                return "windows-1253";
            case (byte) 0xCC:
                return "windows-1257";
            default:
                return ExportParameters.ENCODING_UTF8;
        }
    } catch (Exception ex) {
        return ExportParameters.ENCODING_UTF8;
    }
}
C 源码:
/*
 * Writes into `result` the name of the character encoding that corresponds to
 * the .dbf Language Driver ID `encodingCode`.
 *
 * encodingCode: LDID byte taken from the .dbf header.
 * result:       caller-supplied output buffer; it receives a NUL-terminated name
 *               via strcpy, so it must be large enough for the longest name
 *               (NOTE(review): the required size is not visible here; the local
 *               scratch buffer is 20 bytes, confirm against callers).
 * cpgPath:      path handed to getCpgEncode() when the LDID is 0x00 or 0x57.
 *
 * For 0x00 and 0x57 the name comes from getCpgEncode(); if that call does not
 * return 1, the result is "UTF-8" for 0x00 and whatever getSystemEncode()
 * reports for 0x57. Unrecognised IDs yield "UTF-8".
 */
void getEncoding(unsigned char encodingCode, char *result, const char *cpgPath)
{
    const char *name;

    switch (encodingCode)
    {
    case 0x00:
    case 0x57:
    {
        char encode[20] = { '\0' };

        /* presumably 1 signals that the .cpg file was read successfully */
        if (getCpgEncode(cpgPath, encode) == 1)
        {
            strcpy(result, encode);
        }
        else if (encodingCode == 0x00)
        {
            strcpy(result, "UTF-8");
        }
        else
        {
            getSystemEncode(encode);
            strcpy(result, encode);
        }
        return;
    }
    case 0x01:
    case 0x09:
    case 0x0B:
    case 0x0D:
    case 0x0F:
    case 0x11:
    case 0x15:
    case 0x18:
    case 0x19:
    case 0x1B:
        name = "CP437";
        break;
    case 0x02:
    case 0x0A:
    case 0x0E:
    case 0x10:
    case 0x12:
    case 0x14:
    case 0x16:
    case 0x1A:
    case 0x1D:
    case 0x25:
    case 0x37:
        name = "CP850";
        break;
    case 0x03:
    case 0x58:
    case 0x59:
        name = "CP1252";
        break;
    case 0x04:
        name = "Macintosh";
        break;
    case 0x08:
    case 0x17:
    case 0x66:
        name = "CP865";
        break;
    case 0x13:
    case 0x7B:
        /* Code page 932 (Shift-JIS); ISO-2022-JP is a different 7-bit encoding. */
        name = "CP932";
        break;
    case 0x1C:
    case 0x6C:
        name = "CP863";
        break;
    case 0x1F:
    case 0x22:
    case 0x23:
    case 0x40:
    case 0x64:
    case 0x87:
        name = "CP852";
        break;
    case 0x24:
        name = "CP860";
        break;
    case 0x26:
    case 0x65:
        name = "CP866";
        break;
    case 0x4D:
    case 0x7A:
        name = "GB18030";
        break;
    case 0x4E:
    case 0x79:
        /* Code page 949; ISO-2022-KR is a different 7-bit encoding. */
        name = "CP949";
        break;
    case 0x4F:
    case 0x78:
        name = "BIG5";
        break;
    case 0x50:
    case 0x7C:
        name = "CP874";
        break;
    case 0x67:
        name = "CP861";
        break;
    case 0x68:
        name = "UTF-8";
        break;
    case 0x69:
        name = "UTF-8";
        break;
    case 0x6A:
    case 0x86:
        name = "CP737";
        break;
    case 0x6B:
    case 0x88:
        name = "CP857";
        break;
    case 0x96:
        name = "MacCyrillic";
        break;
    case 0x97:
        name = "MacCentralEurope";
        break;
    case 0x98:
        name = "MacGreek";
        break;
    case 0xC8:
        name = "CP1250";
        break;
    case 0xC9:
        name = "CP1251";
        break;
    case 0xCA:
        name = "CP1254";
        break;
    case 0xCB:
        name = "CP1253";
        break;
    case 0xCC:
        name = "CP1257";
        break;
    default:
        name = "UTF-8";
        break;
    }

    strcpy(result, name);
}
参考资料:
《Shape文件的解析》、《shapefile与字符集编码设置》