gistfile1.txt package utils;/**version: 1.1 / 2007-01-25- changed BOM recognition ordering (longer boms first)Original pseudocode : Thomas WeidenfellerImplementation tweaked: Aki Nieminenhttp://www.unicode.org/unicode/faq/utf_bom.htmlBOMs i
/**
 * Package in the original gist file: utils
 *
 * version: 1.1 / 2007-01-25
 * - changed BOM recognition ordering (longer boms first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs in byte length ordering:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8,
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 ***/

import java.io.*;

/**
 * This inputstream will recognize unicode BOM marks and will skip bytes if
 * getEncoding() method is called before any of the read(...) methods.
 *
 * <p>If a read(...) method or close() is called first, the stream is marked
 * as initialized without any BOM detection: no bytes are skipped and
 * getEncoding() keeps returning null.
 *
 * <p>Usage pattern:
 * <pre>
 *   String enc = "ISO-8859-1"; // or NULL to use systemdefault
 *   FileInputStream fis = new FileInputStream(file);
 *   UnicodeInputStream uin = new UnicodeInputStream(fis, enc);
 *   enc = uin.getEncoding(); // check and skip possible BOM bytes
 *   InputStreamReader in;
 *   if (enc == null) in = new InputStreamReader(uin);
 *   else in = new InputStreamReader(uin, enc);
 * </pre>
 */
public class UnicodeInputStream extends InputStream {
    PushbackInputStream internalIn;
    boolean isInited = false;
    String defaultEnc;
    String encoding;

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * Wraps the given stream so that a leading BOM can be detected.
     *
     * @param in         stream to read from
     * @param defaultEnc encoding reported by getEncoding() when the stream
     *                   does not start with a BOM; may be null
     */
    public UnicodeInputStream(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding given to the constructor (may be null)
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Detects and skips a leading BOM, unless the stream was already
     * initialized, and returns the resulting encoding name.
     *
     * @return the encoding implied by the BOM, the default encoding when no
     *         BOM is present, or null when a read(...) or close() call
     *         preceded the first call of this method
     * @throws IllegalStateException if reading the first bytes fails; the
     *         underlying IOException is attached as the cause
     */
    public String getEncoding() {
        if (!isInited) {
            try {
                init();
            } catch (IOException ex) {
                // Attach the real I/O failure as the cause. Calling
                // initCause() with the exception itself is rejected by
                // Throwable and would hide the original error.
                throw new IllegalStateException("Init method failed.", ex);
            }
        }
        return encoding;
    }

    /**
     * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
     * back to the stream, only BOM bytes are skipped.
     *
     * <p>A stream that starts with FF FE 00 00 is always treated as
     * UTF-32LE, because the longer BOMs are tested first.
     *
     * @throws IOException if the underlying stream cannot be read
     */
    protected void init() throws IOException {
        if (isInited) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may deliver fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // Every check is limited to the bytes actually read; otherwise the
        // zero-filled tail of the buffer could complete a longer BOM.
        int bomLength;
        if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Push back everything that was read beyond the BOM itself.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        isInited = true;
    }

    /** Tells whether the first {@code filled} bytes of {@code buffer} start with {@code signature}. */
    private static boolean hasPrefix(byte[] buffer, int filled, int... signature) {
        if (filled < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buffer[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the underlying stream. No BOM detection is performed.
     */
    @Override
    public void close() throws IOException {
        isInited = true;
        internalIn.close();
    }

    /**
     * Reads one byte. If getEncoding() was not called before, the stream is
     * marked as initialized and no BOM bytes are skipped.
     */
    @Override
    public int read() throws IOException {
        isInited = true;
        return internalIn.read();
    }

    /**
     * Bulk read that delegates to the wrapped stream instead of falling back
     * to the byte-by-byte loop inherited from InputStream. BOM handling is
     * the same as for {@link #read()}.
     */
    @Override
    public int read(byte[] b, int off, int len) throws IOException {
        isInited = true;
        return internalIn.read(b, off, len);
    }
}
// gistfile2.txt
/**
 * Package in the original gist file: utils
 *
 * version: 1.1 / 2007-01-25
 * - changed BOM recognition ordering (longer boms first)
 *
 * Original pseudocode   : Thomas Weidenfeller
 * Implementation tweaked: Aki Nieminen
 *
 * http://www.unicode.org/unicode/faq/utf_bom.html
 * BOMs:
 *   00 00 FE FF = UTF-32, big-endian
 *   FF FE 00 00 = UTF-32, little-endian
 *   EF BB BF    = UTF-8,
 *   FE FF       = UTF-16, big-endian
 *   FF FE       = UTF-16, little-endian
 *
 * Win2k Notepad:
 *   Unicode format = UTF-16LE
 ***/

import java.io.*;

/**
 * Generic unicode textreader, which will use BOM mark to identify the encoding
 * to be used. If BOM is not found then use a given default or system encoding.
 */
public class UnicodeReader extends Reader {
    PushbackInputStream internalIn;
    InputStreamReader internalIn2 = null;
    String defaultEnc;

    /** Length in bytes of the longest BOM that is recognized (UTF-32). */
    private static final int BOM_SIZE = 4;

    /**
     * @param in
     *            inputstream to be read
     * @param defaultEnc
     *            default encoding if stream does not have BOM marker. Give NULL
     *            to use system-level default.
     */
    public UnicodeReader(InputStream in, String defaultEnc) {
        internalIn = new PushbackInputStream(in, BOM_SIZE);
        this.defaultEnc = defaultEnc;
    }

    /**
     * @return the default encoding given to the constructor (may be null)
     */
    public String getDefaultEncoding() {
        return defaultEnc;
    }

    /**
     * Get stream encoding or NULL if stream is uninitialized. Call init() or
     * read() method to initialize it.
     *
     * <p>The name is whatever {@link InputStreamReader#getEncoding()} reports
     * for the decoder in use, so it is not necessarily spelled like the name
     * that was detected.
     */
    public String getEncoding() {
        if (internalIn2 == null) {
            return null;
        }
        return internalIn2.getEncoding();
    }

    /**
     * Read-ahead four bytes and check for BOM marks. Extra bytes are unread
     * back to the stream, only BOM bytes are skipped.
     *
     * <p>A stream that starts with FF FE 00 00 is always treated as
     * UTF-32LE, because the longer BOMs are tested first.
     *
     * @throws IOException if the underlying stream cannot be read or the
     *         selected encoding is not supported
     */
    protected void init() throws IOException {
        if (internalIn2 != null) {
            return;
        }

        byte[] bom = new byte[BOM_SIZE];

        // A single read() may deliver fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        int n = 0;
        while (n < bom.length) {
            int count = internalIn.read(bom, n, bom.length - n);
            if (count <= 0) {
                break;
            }
            n += count;
        }

        // Every check is limited to the bytes actually read; otherwise the
        // zero-filled tail of the buffer could complete a longer BOM.
        String encoding;
        int bomLength;
        if (hasPrefix(bom, n, 0x00, 0x00, 0xFE, 0xFF)) {
            encoding = "UTF-32BE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE, 0x00, 0x00)) {
            encoding = "UTF-32LE";
            bomLength = 4;
        } else if (hasPrefix(bom, n, 0xEF, 0xBB, 0xBF)) {
            encoding = "UTF-8";
            bomLength = 3;
        } else if (hasPrefix(bom, n, 0xFE, 0xFF)) {
            encoding = "UTF-16BE";
            bomLength = 2;
        } else if (hasPrefix(bom, n, 0xFF, 0xFE)) {
            encoding = "UTF-16LE";
            bomLength = 2;
        } else {
            // Unicode BOM mark not found, unread all bytes
            encoding = defaultEnc;
            bomLength = 0;
        }

        // Push back everything that was read beyond the BOM itself.
        if (n > bomLength) {
            internalIn.unread(bom, bomLength, n - bomLength);
        }

        // Use given encoding
        if (encoding == null) {
            internalIn2 = new InputStreamReader(internalIn);
        } else {
            internalIn2 = new InputStreamReader(internalIn, encoding);
        }
    }

    /** Tells whether the first {@code filled} bytes of {@code buffer} start with {@code signature}. */
    private static boolean hasPrefix(byte[] buffer, int filled, int... signature) {
        if (filled < signature.length) {
            return false;
        }
        for (int i = 0; i < signature.length; i++) {
            if (buffer[i] != (byte) signature[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Closes the reader and the underlying stream.
     *
     * <p>If nothing has been read yet, the stream is closed directly instead
     * of running BOM detection first, so a failing read cannot prevent the
     * stream from being closed.
     */
    @Override
    public void close() throws IOException {
        if (internalIn2 != null) {
            internalIn2.close();
        } else {
            internalIn.close();
        }
    }

    /**
     * Reads decoded characters, running BOM detection on the first call.
     */
    @Override
    public int read(char[] cbuf, int off, int len) throws IOException {
        init();
        return internalIn2.read(cbuf, off, len);
    }
}
// gistfile3.txt
package utils; import com.xiaoleilu.hutool.io.FileUtil; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; /** * Created by DimonHo on 2017/12/27. */ public class FileUtils extends FileUtil { /** * 按行读取文件全部内容 * @param file * @return 注意文件首行第一个字符实际上会有一个空字符,表示文件BOM。 */ public static ArrayListreadBomLines(File file){ ArrayList lines = new ArrayList (); try { //FileReader reader = new FileReader(file); //会有首行bom字符问题存在,改用下面的方法,可避免首行第一个bom字符问题。 UnicodeReader r = new UnicodeReader(new FileInputStream(file), null); BufferedReader bufferedReader = new BufferedReader(r); String str = null; while((str = bufferedReader.readLine()) != null) { lines.add(str); } bufferedReader.close(); r.close(); } catch (Exception e) { e.printStackTrace(); } return lines; } }