需求:
需要检测某个文件或某个字节流的编码格式。
实现:
基于jchardet
代码如下:
/**
 * Static helpers that guess the character encoding of a byte stream using jchardet's
 * {@code nsDetector} configured with {@code nsPSMDetector.CHINESE}.
 */
public class DetectorUtils {

    /** Not instantiable: the class only exposes static members. */
    private DetectorUtils() {
    }

    /**
     * Observer registered with the detector through {@code Init}. It remembers the charset
     * passed to {@link #Notify(String)} and whether that callback has been invoked.
     */
    static class ChineseCharsetDetectionObserver implements nsICharsetDetectionObserver {

        private boolean found;
        private String result;

        /**
         * @param found  initial value of the "charset reported" flag
         * @param result initial charset name, returned by {@link #getResult()} until
         *               {@link #Notify(String)} overwrites it
         */
        public ChineseCharsetDetectionObserver(boolean found, String result) {
            super();
            this.found = found;
            this.result = result;
        }

        /**
         * Records {@code charset} and marks the observer as found.
         * Presumably called by jchardet once it settles on a single charset.
         */
        public void Notify(String charset) {
            found = true;
            result = charset;
        }

        /** @return {@code true} once a charset has been reported (or if constructed that way) */
        public boolean isFound() {
            return found;
        }

        /** @return the most recently recorded charset name */
        public String getResult() {
            return result;
        }
    }

    /**
     * Reads {@code in} to the end (or until the detector's {@code DoIt} returns {@code true})
     * and returns the candidate charset names.
     *
     * <ul>
     *   <li>{@code {"ASCII"}} when every chunk read passed {@code isAscii}; this includes an
     *       empty stream, because no chunk is ever inspected;</li>
     *   <li>otherwise the single charset reported to the observer, if one was reported;</li>
     *   <li>otherwise whatever {@code getProbableCharsets()} returns.</li>
     * </ul>
     *
     * <p>The stream is always closed quietly before this method returns, so callers must not
     * reuse it afterwards.
     *
     * @param in the bytes to inspect; closed by this method
     * @return the detected charset name(s) as described above
     * @throws Exception if reading the stream or running the detector fails
     */
    public static String[] detectChineseCharset(InputStream in) throws Exception {
        BufferedInputStream buffered = null;
        try {
            // Seed value for the observer; it is only read back after Notify has replaced it.
            String fallbackCharset = Charsets.UTF_8.toString();
            nsDetector detector = new nsDetector(nsPSMDetector.CHINESE);
            ChineseCharsetDetectionObserver observer =
                    new ChineseCharsetDetectionObserver(false, fallbackCharset);
            detector.Init(observer);

            buffered = new BufferedInputStream(in);
            byte[] chunk = new byte[1024];
            boolean asciiOnly = true;

            int read = buffered.read(chunk, 0, chunk.length);
            while (read != -1) {
                // Keep testing for pure ASCII only while every earlier chunk was ASCII.
                if (asciiOnly) {
                    asciiOnly = detector.isAscii(chunk, read);
                }
                // From the first non-ASCII chunk onwards, feed the detector and stop reading
                // as soon as DoIt returns true (presumably: the detector has reached a verdict).
                if (!asciiOnly && detector.DoIt(chunk, read, false)) {
                    break;
                }
                read = buffered.read(chunk, 0, chunk.length);
            }
            detector.DataEnd();

            if (asciiOnly) {
                return new String[] { "ASCII" };
            }
            if (observer.isFound()) {
                return new String[] { observer.getResult() };
            }
            return detector.getProbableCharsets();
        } finally {
            IOUtils.closeQuietly(buffered);
            IOUtils.closeQuietly(in);
        }
    }
}
测试:
// Usage example: print every candidate charset detected for a sample file.
// detectChineseCharset closes the stream it is given, so it is not closed again here.
String file = "C:/3737001.xml";
FileInputStream source = new FileInputStream(file);
String[] probableSet = DetectorUtils.detectChineseCharset(source);
for (int i = 0; i < probableSet.length; i++) {
    System.out.println(probableSet[i]);
}
依赖的jar参见附件
原文:http://snv.iteye.com/blog/2149873