Java读带有BOM的UTF-8文件乱码

package com.java.io;import java.io.IOException;import java.io.InputStream;import java.io.InputStreamReader;import java.io.PushbackInputStream;import java.io.Reader;/**version: 1.1 / 2007-01-25- changed BOM recognition ordering (longer boms first)网络地址:Original pseudocode : Thomas WeidenfellerImplementation tweaked: Aki NieminenBOMs: 00 00 FE FF = UTF-32, big-endian FF FE 00 00 = UTF-32, little-endian EF BB BF= UTF-8, FE FF= UTF-16, big-endian FF FE= UTF-16, little-endianWin2k Notepad: Unicode format = UTF-16LE***//*** Generic unicode textreader, which will use BOM mark* to identify the encoding to be used. If BOM is not found* then use a given default or system encoding.*/public class UnicodeReader extends Reader { PushbackInputStream internalIn; InputStreamReader internalIn2 = null; StringdefaultEnc; private static final int BOM_SIZE = 4; /** * * @param in inputstream to be read * @param defaultEnc default encoding if stream does not have *BOM marker. Give NULL to use system-level default. */ UnicodeReader(InputStream in, String defaultEnc) {internalIn = new PushbackInputStream(in, BOM_SIZE);this.defaultEnc = defaultEnc; } public String getDefaultEncoding() {return defaultEnc; } /** * Get stream encoding or NULL if stream is uninitialized. * Call init() or read() method to initialize it. */ public String getEncoding() {if (internalIn2 == null) return null;return internalIn2.getEncoding(); } /** * Read-ahead four bytes and check for BOM marks. Extra bytes are * unread back to the stream, only BOM bytes are skipped. 
*/ protected void init() throws IOException {if (internalIn2 != null) return;String encoding;byte bom[] = new byte[BOM_SIZE];int n, unread;n = internalIn.read(bom, 0, bom.length);if ( (bom[0] == (byte)0x00) && (bom[1] == (byte)0x00) &&(bom[2] == (byte)0xFE) && (bom[3] == (byte)0xFF) ) {encoding = “UTF-32BE”;unread = n – 4;} else if ( (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE) &&(bom[2] == (byte)0x00) && (bom[3] == (byte)0x00) ) {encoding = “UTF-32LE”;unread = n – 4;} else if ( (bom[0] == (byte)0xEF) && (bom[1] == (byte)0xBB) &&(bom[2] == (byte)0xBF) ) {encoding = “UTF-8”;unread = n – 3;} else if ( (bom[0] == (byte)0xFE) && (bom[1] == (byte)0xFF) ) {encoding = “UTF-16BE”;unread = n – 2;} else if ( (bom[0] == (byte)0xFF) && (bom[1] == (byte)0xFE) ) {encoding = “UTF-16LE”;unread = n – 2;} else {// Unicode BOM mark not found, unread all bytesencoding = defaultEnc;unread = n;}//System.out.println(“read=” + n + “, unread=” + unread);if (unread > 0) internalIn.unread(bom, (n – unread), unread);// Use given encodingif (encoding == null) {internalIn2 = new InputStreamReader(internalIn);} else {internalIn2 = new InputStreamReader(internalIn, encoding);} } public void close() throws IOException {init();internalIn2.close(); } public int read(char[] cbuf, int off, int len) throws IOException {init();return internalIn2.read(cbuf, off, len); }}

测试类

package com.java.io;import java.io.BufferedReader;import java.io.File;import java.io.FileInputStream;import java.io.InputStreamReader;import java.nio.charset.Charset;public class BomRead {/*** 读带有BOM的UTF-8文件乱码* @param args*/public static void main(String[] args)throws Exception {File file = new File(“E:\\JS_Exercise\\JavaExercise\\BOM.txt”);FileInputStream in = new FileInputStream(file);BufferedReader br = new BufferedReader(new InputStreamReader(in, “UTF-8”));String line = null;System.out.println(“处理前:”);while((line = br.readLine()) != null){System.out.println(line);}File file2 = new File(“E:\\JS_Exercise\\JavaExercise\\BOM.txt”);FileInputStream in2 = new FileInputStream(file2);BufferedReader br2 = new BufferedReader(new UnicodeReader(in2, “UTF-8”));String line2 = null;System.out.println(“处理后:”);while((line2 = br2.readLine()) != null){System.out.println(line2);}}}

输出结果

因此在遇到此问题的时候可以特殊问题特殊处理。

本文出自 “风云海滩” 博客,请务必保留此出处

放下一种执着,收获一种自在。放下既是一种理性抉择,也是一种豁达美。

Java读带有BOM的UTF-8文件乱码

相关文章:

你感兴趣的文章:

标签云: