当前位置 : 主页 > 编程语言 > c++ >

CHARDET JAVA

来源:互联网 收集:自由互联 发布时间:2021-07-03
CHARDET.JAVA // Copyright (C) 2010 Google Inc.//// Licensed under the Apache License, Version 2.0 (the "License");// you may not use this file except in compliance with the License.// You may obtain a copy of the License at//// http://www.a
CHARDET.JAVA
// Copyright (C) 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package com.google.caja.lexer;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.UnsupportedEncodingException;
import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;

import com.google.caja.SomethingWidgyHappenedError;
import com.google.caja.util.Pair;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

/**
 * Utilities for dealing with converting byte streams with unknown character
 * sets to character streams.
 *
 * @author Mike Samuel 
 
  
 */
public final class Chardet {
  private Chardet() { /* uninstantiable */ }

  private static final String UTF8 = "UTF-8";
  private static final String UTF16BE = "UTF-16BE";
  private static final String UTF16LE = "UTF-16LE";
  private static final String UTF32BE = "UTF-32BE";
  private static final String UTF32LE = "UTF-32LE";
  private static final String UTF7 = "UTF-7";
  private static final String UTF1 = "UTF-1";
  private static final String ISO_8859_1 = "ISO-8859-1";

  /**
   * Given a byte stream, figure out an encoding and return a character stream
   * and the encoding used to convert bytes to characters.
   */
  public static Pair
  
    guessCharset(InputStream in) throws IOException { ByteArrayOutputStream buffered = new ByteArrayOutputStream(); byte[] buf = new byte[1024]; boolean isAscii = true; int len = in.read(buf); if (len <= 0) { return Pair.pair((Reader) new StringReader(""), UTF8); } String charset = findCharset(buf, len); if (charset != null) { // If the charset is specified in the document, use that. buffered.write(buf, 0, len); // Otherwise, look for a BOM at the start of the content. } else if (hasUtf8BOM(buf, len)) { charset = UTF8; buffered.write(buf, 3, len - 3); // Check UTF32 before UTF16 since a little endian UTF16 BOM is a prefix of // a little endian UTF32 BOM. } else if (hasUtf32BEBOM(buf, len)) { charset = UTF32BE; buffered.write(buf, 4, len - 4); } else if (hasUtf32LEBOM(buf, len)) { charset = UTF32LE; buffered.write(buf, 4, len - 4); } else if (hasUtf16BEBOM(buf, len)) { charset = UTF16BE; buffered.write(buf, 2, len - 2); } else if (hasUtf16LEBOM(buf, len)) { charset = UTF16LE; buffered.write(buf, 2, len - 2); } else if (hasUtf7BOM(buf, len)) { charset = UTF7; buffered.write(buf, 4, len - 4); } else if (hasUtf1BOM(buf, len)) { charset = UTF1; buffered.write(buf, 3, len - 3); } else { // Use jchardet which tries a variety of heuristics to choose an encoding. nsDetector det = new nsDetector(nsPSMDetector.ALL); // The below is adapted from the main method in HtmlCharsetDetector. Observer observer = new Observer(); det.Init(observer); do { buffered.write(buf, 0, len); if (isAscii) { isAscii = det.isAscii(buf, len); } if (!isAscii) { if (det.DoIt(buf, len, false)) { break; } } } while ((len = in.read(buf)) > 0); det.DataEnd(); charset = observer.charset; } if (charset != null) { charset = supportedCharsetName(charset); } if (charset == null) { charset = UTF8; } return Pair.pair( joinStreamsWithCharset(buffered.toByteArray(), in, charset), charset); } static final class Observer implements nsICharsetDetectionObserver { String charset; public void Notify(String charset) { this.charset = charset; } } private static final byte[] CHARSET_BYTES; private static final byte[] ENCODING_BYTES; static { try { CHARSET_BYTES = "charset".getBytes(ISO_8859_1); ENCODING_BYTES = "encoding".getBytes(ISO_8859_1); } catch (UnsupportedEncodingException ex) { throw new SomethingWidgyHappenedError(ex); } } /** * Looks for sequences like {@code charset="..."} inside angle brackets to * match {@code } and after {@code 
   ') { i = pos - 1; break; } } else if (b == '<' || b == '>') { i = j - 1; break; } lastByte = buf[j]; } } return null; } /** * Produces a character stream from an underlying byte stream. * @param buffered lookahead bytes read from tail. * @param tail the unread portion of the stream * @param charset the character set to use to decode the bytes in buffered and * tail. */ private static Reader joinStreamsWithCharset( byte[] buffered, InputStream tail, String charset) throws IOException { return new InputStreamReader(new JoinedStream(buffered, tail), charset); } static final class JoinedStream extends InputStream { byte[] buffered; int pos; final InputStream tail; JoinedStream(byte[] buffered, InputStream tail) { this.buffered = buffered; this.tail = tail; } @Override public int read() throws IOException { if (buffered != null) { if (pos < buffered.length) { return buffered[pos++]; } buffered = null; } return tail.read(); } @Override public int read(byte[] out, int off, int len) throws IOException { int nRead = 0; if (buffered != null) { int avail = buffered.length - pos; if (avail != 0) { int k = Math.min(len, avail); int p1 = pos + k; int p2 = off + k; pos = p1; while (--p2 >= off) { out[p2] = buffered[--p1]; } off += k; len -= k; nRead = k; } else { buffered = null; } } if (len == 0) { return nRead; } int nFromTail = tail.read(out, off, len); if (nFromTail > 0) { return nFromTail + nRead; } return nRead != 0 ? nRead : -1; } @Override public void close() throws IOException { buffered = null; tail.close(); } } private static boolean isAlnum(byte b) { if (b < '0' || b > 'z') { return false; } if (b < 'A') { return b <= '9'; } return b >= 'a' || b <= 'Z'; } private static boolean isSpace(byte b) { return b <= ' ' && (b == ' ' || b == '\r' || b == '\n' || b == '\t' || b == '\f'); } static String supportedCharsetName(String s) { try { return Charset.forName(s).name(); } catch (UnsupportedCharsetException ex) { return null; } catch (IllegalCharsetNameException ex) { return null; } } private static final byte _00 = (byte) 0, _2B = (byte) 0x2b, _2F = (byte) 0x2f, _38 = (byte) 0x38, _39 = (byte) 0x39, _4C = (byte) 0x4c, _64 = (byte) 0x64, _76 = (byte) 0x76, _BB = (byte) 0xbb, _BF = (byte) 0xbf, _EF = (byte) 0xef, _F7 = (byte) 0xf7, _FE = (byte) 0xfe, _FF = (byte) 0xff; // See http://en.wikipedia.org/wiki/Byte_order_mark for a table of byte // sequences. private static boolean hasUtf8BOM(byte[] b, int len) { return len >= 3 && b[0] == _EF && b[1] == _BB && b[2] == _BF; } private static boolean hasUtf16BEBOM(byte[] b, int len) { return len >= 2 && b[0] == _FE && b[1] == _FF; } private static boolean hasUtf16LEBOM(byte[] b, int len) { return len >= 2 && b[0] == _FF && b[1] == _FE; } private static boolean hasUtf32BEBOM(byte[] b, int len) { return len >= 4 && b[0] == _00 && b[1] == _00 && b[2] == _FE && b[3] == _FF; } private static boolean hasUtf32LEBOM(byte[] b, int len) { return len >= 4 && b[0] == _FF && b[1] == _FE && b[2] == _00 && b[3] == _00; } private static boolean hasUtf7BOM(byte[] b, int len) { if (len < 4 || b[0] != _2B || b[1] != _2F || b[2] != _76) { return false; } byte b3 = b[3]; return b3 == _38 || b3 == _39 || b3 == _2B || b3 == _2F; } private static boolean hasUtf1BOM(byte[] b, int len) { return len >= 3 && b[0] == _F7 && b[1] == _64 && b[2] == _4C; } }
  
 
网友评论