maven依赖 org.apache.poi poi-scratchpad 3.14 org.apache.commons commons-lang3 3.3.2 doc文档字符计数 import org.apache.commons.lang3.ArrayUtils;import org.apache.poi.hwpf.extractor.WordExtractor;import java.io.File;import java.io.Fil
Maven依赖: org.apache.poi:poi-scratchpad:3.14, org.apache.commons:commons-lang3:3.3.2

doc文档字符计数
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.regex.Pattern;

import org.apache.commons.lang3.ArrayUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;
/**
 * Counts the characters of a legacy Word ({@code .doc}) document, paragraph by paragraph,
 * after removing control characters and field-code markup from the extracted text.
 *
 * Created by chunsw@aliyun.com on 16/5/23.
 */
public class CountDoc {
    /** Characters removed from every paragraph before it is inspected and counted. */
    private static final char[] NOISE_CHARS = { '\u0007', '\f', '\b', '\u0015', '\r', '\n' };

    /** Leading text of a table-of-contents field; stripped when a paragraph starts with it. */
    private static final String TOC_PREFIX = " TOC \\o \\u \u0014";

    /** Marker that introduces a hyperlink field inside a paragraph. */
    private static final String HYPERLINK_FLAG = "\u0013 HYPERLINK";

    /**
     * Field that starts with U+0013 and ends with U+0014 U+0001, for example
     * "U+0013 EMBED Visio.Drawing.11 U+0014 U+0001" or
     * "U+0013 EMBED Word.Document.12 U+0014 U+0001" (embedded objects).
     */
    private static final Pattern EMBEDDED_FIELD = Pattern.compile("\u0013[^\u0014\u0001]+\u0014\u0001");

    /** Innermost field code: U+0013, then anything except U+0014/U+0013, then U+0014. */
    private static final Pattern FIELD_CODE = Pattern.compile("\u0013[^\u0014\u0013]+\u0014");

    /** Separator (space followed by U+0014) used to split a hyperlink paragraph. */
    private static final Pattern HYPERLINK_SEPARATOR = Pattern.compile(" \u0014");

    /** Run of at least 767 consecutive spaces. */
    private static final Pattern LONG_SPACE_RUN = Pattern.compile(" {767,}");

    /**
     * Counts the characters in the given {@code .doc} file.
     *
     * <p>The count is the same whether or not {@code isDebug} is set: line breaks are always
     * excluded. In debug mode the cleaned paragraphs (one per line), the count and the elapsed
     * time are additionally printed to standard output.
     *
     * @param doc     path of the {@code .doc} file to read
     * @param isDebug whether to print the cleaned text, the count and the timing
     * @return a two-element array: {@code [characterCount, elapsedMilliseconds]}
     * @throws Exception if the file cannot be opened or the extractor fails
     */
    static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
        long startMillis = System.currentTimeMillis();
        int cnt = 0;
        StringBuilder debugText = new StringBuilder();

        // try-with-resources: the file handle is released even when extraction throws.
        try (InputStream is = new FileInputStream(new File(doc))) {
            WordExtractor ex = new WordExtractor(is);
            for (String paragraph : ex.getParagraphText()) {
                String text = cleanParagraph(paragraph);
                if (isDebug) {
                    // Paragraph breaks are only for readability; they are not counted.
                    debugText.append(text).append('\n');
                }
                cnt += text.length();
            }
        }

        int elapsedMillis = (int) (System.currentTimeMillis() - startMillis);
        if (isDebug) {
            System.out.println(debugText.toString());
            System.out.println(cnt);
            System.out.println(elapsedMillis + " ms");
        }
        return new int[] { cnt, elapsedMillis };
    }

    /**
     * Reduces one extracted paragraph to the text that should be counted.
     *
     * @param paragraph raw paragraph text from the extractor; {@code null} is treated as empty
     * @return the paragraph without control characters and field markup, never {@code null}
     */
    private static String cleanParagraph(String paragraph) {
        String text = trimAllChars(paragraph, NOISE_CHARS);
        if (text == null) {
            return "";
        }

        if (text.startsWith(TOC_PREFIX)) {
            text = text.substring(TOC_PREFIX.length());
        }

        // Only run the embedded-object regex when both of its delimiters occur in order.
        int start = text.indexOf("\u0013");
        int end = text.indexOf("\u0014\u0001");
        if (start >= 0 && end > start) {
            text = EMBEDDED_FIELD.matcher(text).replaceAll("");
        }
        text = FIELD_CODE.matcher(text).replaceAll("");

        int pos = text.indexOf(HYPERLINK_FLAG);
        if (pos >= 0) {
            String[] parts = HYPERLINK_SEPARATOR.split(text);
            // Without a separator there is no display text to splice in; leave the paragraph
            // untouched instead of failing with ArrayIndexOutOfBoundsException.
            if (parts.length > 1) {
                text = text.substring(0, pos) + parts[1];
            }
        }

        if (text.length() >= 767) {
            // Original author's note: in Word .doc format, when a run of consecutive characters
            // is longer than 767 (i.e. at least 768), it is not included in the count.
            // NOTE(review): the pattern already drops runs of exactly 767 spaces - confirm
            // which threshold is intended.
            text = LONG_SPACE_RUN.matcher(text).replaceAll("");
        }
        return text;
    }

    /**
     * Returns {@code text} with every occurrence of the given characters removed.
     *
     * @param text  input string; {@code null} and empty strings are returned unchanged
     * @param chars characters to drop
     * @return the filtered string
     */
    private static String trimAllChars(String text, char[] chars) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder builder = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!ArrayUtils.contains(chars, c)) {
                builder.append(c);
            }
        }
        return builder.toString();
    }
}
 docx文档字符计数
 
package com.github.wangshichun.util.word.count;
import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;
import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
/**
 * Created by chunsw@aliyun.com on 16/5/23.
 */
public class CountDocx {
    static int[] wordCountNew(Object xmlSource, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        XMLReader parser = XMLReaderFactory.createXMLReader();
        final Integer[] cnt = {0};
        final Integer[] sectPrCount = {0};
        final Integer[] brCount = {0};
        final Integer[] numPrCount = {0};
        Map
 
   numIdMap = new HashMap
  
   (); StringBuilder stringBuilder2 = new StringBuilder(); if ((xmlSource instanceof String && xmlSource.toString().endsWith(".docx")) || xmlSource instanceof InputStream) { // System.out.println("in zip file"); ZipInputStream zipInputStream = new ZipInputStream(xmlSource instanceof InputStream ? (InputStream) xmlSource : new FileInputStream((String) xmlSource)); NoCloseInputStream noCloseInputStream = new NoCloseInputStream(new BufferedInputStream(zipInputStream)); ZipEntry zipEntry; while ((zipEntry = zipInputStream.getNextEntry()) != null) { // 项目符号和编号的格式定义(例如: 多级列表的一级为`
   
     numIdMap; private StringBuilder stringBuilder2; private boolean isDebug; private boolean inPicture = false; private Integer pStyle = null; DocumentXMLHandler(Integer[] cnt, Integer[] sectPrCount, Integer[] brCount, Integer[] numPrCount, Map
    
      numIdMap, StringBuilder stringBuilder2, boolean isDebug) { this.cnt = cnt; this.sectPrCount = sectPrCount; this.brCount = brCount; this.numPrCount = numPrCount; this.numIdMap = numIdMap; this.stringBuilder2 = stringBuilder2; this.isDebug = isDebug; numIdMap.clear(); } @Override public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { // Using qualified name because we are not using xmlns prefixes here. if (qName.equals("w:tabs")) { inTabs = true; } else if (qName.equals("w:tab")) { if (!inTabs) cnt[0]++; } else if (qName.equals("w:sectPr")) { sectPrCount[0]++; } else if (qName.equals("w:br")) { if (atts.getLength() == 0) brCount[0]++; } else if (qName.equals("w:t")) { inTextElement = true; } else if (qName.equals("w:pPr")) { inPPr = true; } else if (qName.equals("w:pStyle")) { String val = atts.getValue("w:val"); if (StringUtils.isNumeric(val)) { pStyle = Integer.valueOf(val); hasPStyle = true; } } else if (qName.equals("w:numPr")) { inNumPr = true; } else if (qName.equals("w:ilvl")) { if (inNumPr) { String val = atts.getValue("w:val"); ilvl = Integer.valueOf(val); numPrCount[0] += (ilvl + 1) * 2; } } else if (qName.equals("w:numId")) { if (inNumPr && hasPStyle) { String val = atts.getValue("w:val") + "_" + ilvl; numIdMap.putIfAbsent(val, new AtomicInteger(0)); numIdMap.get(val).incrementAndGet(); } } else if (qName.equals("w:pict")) { inPicture = true; } } @Override public void endElement(String namespaceURI, String localName, String qName) throws SAXException { if (qName.equals("w:tabs")) { inTabs = false; } else if (qName.equals("w:pPr")) { inPPr = false; hasPStyle = false; pStyle = null; } else if (qName.equals("w:numPr")) { inNumPr = false; ilvl = null; } else if (qName.equals("w:t")) { inTextElement = false; } else if (qName.equals("w:pict")) { inPicture = false; } } @Override public void characters(char[] ch, int start, int length) { if (!inTextElement || inPicture) return; if (length >= 767) { // word 
doc格式时, 如果连续字符数数大于767个(大于等于768), 则该段落的字数不计入 String text = new String(ch, start, length); text = text.replaceAll(" {767,}", ""); length = text.length(); } cnt[0] += length; if (isDebug) { String text = new String(ch, start, length); stringBuilder2.append(text); } } public void ignorableWhitespace(char ch[], int start, int length) throws SAXException { } } }
     
    
   
  
        
        