当前位置 : 主页 > 编程语言 > c++ >

java计算word文档(doc、docx)中的字符数(含空格)。目前自动编号的计算字符尚

来源:互联网 收集:自由互联 发布时间:2021-06-30
maven依赖 org.apache.poi poi-scratchpad 3.14 org.apache.commons commons-lang3 3.3.2 doc文档字符计数 import org.apache.commons.lang3.ArrayUtils;import org.apache.poi.hwpf.extractor.WordExtractor;import java.io.File;import java.io.Fil
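maven依赖
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>3.14</version>
</dependency>
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-lang3</artifactId>
    <version>3.3.2</version>
</dependency>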
doc文档字符计数
import org.apache.commons.lang3.ArrayUtils;
import org.apache.poi.hwpf.extractor.WordExtractor;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;

/**
 * Counts the characters (spaces included) of a legacy Word {@code .doc} file by
 * summing the length of every paragraph returned by {@link WordExtractor} after
 * control characters and field codes have been stripped.
 *
 * Created by chunsw@aliyun.com on 16/5/23.
 */
public class CountDoc {
    /** Characters removed from every paragraph when debug output is requested. */
    private static final char[] CONTROL_CHARS = { '\u0007', '\f', '\b', '\u0015' };

    /** Characters removed from every paragraph in normal (non-debug) mode. */
    private static final char[] CONTROL_CHARS_AND_LINE_BREAKS =
            { '\u0007', '\f', '\b', '\u0015', '\r', '\n' };

    /** Field-code prefix stripped from the start of a paragraph. */
    private static final String TOC_PREFIX = " TOC \\o \\u \u0014";

    /** Start of a hyperlink field code (0x13 followed by " HYPERLINK"). */
    private static final String HYPERLINK_FLAG = "\u0013 HYPERLINK";

    /**
     * Counts the characters of the given {@code .doc} file.
     *
     * <p>NOTE: when {@code isDebug} is true, carriage returns and line feeds are kept in the
     * paragraph text so the printed dump stays readable; they are therefore also included in
     * the returned count, which can differ from the non-debug count.
     *
     * @param doc     path of the {@code .doc} file to read
     * @param isDebug when true, prints the cleaned text, the count and the elapsed time to
     *                standard output
     * @return a two-element array: {@code [characterCount, elapsedMilliseconds]}
     * @throws Exception if the file cannot be opened or read by {@link WordExtractor}
     */
    static int[] wordCountNew(String doc, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        int cnt = 0;
        StringBuilder builder = new StringBuilder();

        // try-with-resources so the file handle is released even when extraction fails.
        try (InputStream is = new FileInputStream(new File(doc))) {
            WordExtractor ex = new WordExtractor(is);
            for (String paragraph : ex.getParagraphText()) {
                String text = cleanParagraph(paragraph, isDebug);
                if (isDebug) {
                    builder.append(text);
                }
                cnt += text.length();
            }
        }

        int t = (int) (System.currentTimeMillis() - time);

        if (isDebug) {
            System.out.println(builder.toString());
            System.out.println(cnt);
            System.out.println(t + " ms");
        }
        return new int[] { cnt, t };
    }

    /**
     * Strips control characters and field codes from one paragraph so that only the
     * characters that should be counted remain.
     *
     * @param text           raw paragraph text
     * @param keepLineBreaks when true, carriage returns and line feeds are left in place
     * @return the cleaned paragraph text
     */
    private static String cleanParagraph(String text, boolean keepLineBreaks) {
        text = trimAllChars(text, keepLineBreaks ? CONTROL_CHARS : CONTROL_CHARS_AND_LINE_BREAKS);

        if (text.startsWith(TOC_PREFIX)) {
            text = text.substring(TOC_PREFIX.length());
        }

        // Embedded-object field codes run from 0x13 to the pair 0x14 0x01, for example
        // " EMBED Visio.Drawing.11 " or " EMBED Word.Document.12 \s ".
        int start = text.indexOf("\u0013");
        int end = text.indexOf("\u0014\u0001");
        if (start >= 0 && end > start) {
            text = text.replaceAll("\u0013[^\u0014\u0001]+\u0014\u0001", "");
        }
        // Remaining field codes: everything from 0x13 up to and including the next 0x14.
        text = text.replaceAll("\u0013[^\u0014\u0013]+\u0014", "");

        int pos = text.indexOf(HYPERLINK_FLAG);
        if (pos >= 0) {
            String[] arr = text.split(" \u0014");
            // Guard: without a " 0x14" separator there is no second part, and indexing
            // arr[1] unconditionally used to throw ArrayIndexOutOfBoundsException.
            if (arr.length > 1) {
                text = text.substring(0, pos) + arr[1];
            }
        }

        if (text.length() >= 767) {
            // Original author's note (translated): in Word .doc format, when the number of
            // consecutive characters exceeds 767 (i.e. is at least 768), they are not counted.
            // The code drops every run of 767 or more consecutive spaces.
            text = text.replaceAll(" {767,}", "");
        }
        return text;
    }

    /**
     * Returns {@code text} with every occurrence of the given characters removed.
     *
     * @param text  the text to filter; {@code null} and empty strings are returned unchanged
     * @param chars the characters to drop
     * @return the filtered text
     */
    private static String trimAllChars(String text, char[] chars) {
        if (text == null || text.isEmpty()) {
            return text;
        }
        StringBuilder builder = new StringBuilder(text.length());
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (!ArrayUtils.contains(chars, c)) {
                builder.append(c);
            }
        }
        return builder.toString();
    }
}
docx文档字符计数
package com.github.wangshichun.util.word.count;

import org.apache.commons.lang3.StringUtils;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

import java.io.*;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;

/**
 * Counts the characters (spaces included) of a Word {@code .docx} file by streaming
 * {@code word/document.xml} and {@code word/endnotes.xml} through SAX handlers.
 *
 * Created by chunsw@aliyun.com on 16/5/23.
 */
public class CountDocx {
    /**
     * Counts the characters of a {@code .docx} document.
     *
     * @param xmlSource either an {@link InputStream} over a {@code .docx} (zip) file, a
     *                  {@code String} path ending in {@code .docx}, or any other object whose
     *                  {@code toString()} is handed to the XML reader as the system id of an
     *                  already extracted {@code document.xml}. A supplied stream is closed
     *                  before this method returns.
     * @param isDebug   when true, prints the collected text, the count and the elapsed time
     *                  to standard output
     * @return a two-element array: {@code [characterCount, elapsedMilliseconds]}
     * @throws Exception if the source cannot be read or is not well-formed XML
     */
    static int[] wordCountNew(Object xmlSource, boolean isDebug) throws Exception {
        long time = System.currentTimeMillis();
        XMLReader parser = XMLReaderFactory.createXMLReader();
        disableExternalEntities(parser);

        // One-element arrays act as mutable counters shared with the handlers.
        final Integer[] cnt = {0};
        final Integer[] sectPrCount = {0};
        final Integer[] brCount = {0};
        final Integer[] numPrCount = {0};
        Map<String, AtomicInteger> numIdMap = new HashMap<>();
        StringBuilder stringBuilder2 = new StringBuilder();

        boolean isZip = (xmlSource instanceof String && xmlSource.toString().endsWith(".docx"))
                || xmlSource instanceof InputStream;
        if (isZip) {
            // try-with-resources so the zip (and the underlying stream) is closed even when
            // parsing fails part-way through.
            try (ZipInputStream zipInputStream = new ZipInputStream(
                    xmlSource instanceof InputStream
                            ? (InputStream) xmlSource
                            : new FileInputStream((String) xmlSource))) {
                // The XML reader closes its input after each parse; the wrapper ignores that
                // close so the remaining zip entries can still be read.
                NoCloseInputStream noCloseInputStream =
                        new NoCloseInputStream(new BufferedInputStream(zipInputStream));
                ZipEntry zipEntry;
                while ((zipEntry = zipInputStream.getNextEntry()) != null) {
                    // The format definitions of bullets and numbering (e.g. "%1." for level
                    // one and "%1.%2" for level two of a multilevel list) live in
                    // "word/numbering.xml" and are not processed for now.
                    if ("word/document.xml".equals(zipEntry.getName())) {
                        parser.setContentHandler(new DocumentXMLHandler(cnt, sectPrCount, brCount,
                                numPrCount, numIdMap, stringBuilder2, isDebug));
                        parser.parse(new InputSource(noCloseInputStream));
                    }
                    if ("word/endnotes.xml".equals(zipEntry.getName())) {
                        parser.setContentHandler(new EndNotesXMLHandler(cnt, stringBuilder2, isDebug));
                        parser.parse(new InputSource(noCloseInputStream));
                    }
                }
            }
        } else {
            parser.setContentHandler(new DocumentXMLHandler(cnt, sectPrCount, brCount,
                    numPrCount, numIdMap, stringBuilder2, isDebug));
            parser.parse(xmlSource.toString());
        }

        int seqCnt = 0;
        for (AtomicInteger occurrences : numIdMap.values()) {
            seqCnt += extraNumberingDigits(occurrences.get());
        }

        cnt[0] += numPrCount[0];
        // Text characters minus one per w:sectPr, plus one, plus one per attribute-less w:br,
        // plus the extra digits of multi-digit list numbers.
        int len = cnt[0] - sectPrCount[0] + 1 + brCount[0] + seqCnt;
        int t = (int) (System.currentTimeMillis() - time);
        if (isDebug) {
            System.out.println(stringBuilder2);
            System.out.println(len);
            System.out.println(t + " ms");
        }
        return new int[]{len, t};
    }

    /**
     * Best-effort hardening: keeps the reader from resolving external entities or DTDs
     * referenced by the document being counted.
     */
    private static void disableExternalEntities(XMLReader parser) {
        String[] features = {
            "http://xml.org/sax/features/external-general-entities",
            "http://xml.org/sax/features/external-parameter-entities",
            "http://apache.org/xml/features/nonvalidating/load-external-dtd"
        };
        for (String feature : features) {
            try {
                parser.setFeature(feature, false);
            } catch (SAXException ignored) {
                // This reader implementation does not know the feature; keep its default.
            }
        }
    }

    /**
     * Returns the number of digits beyond the first that are needed to write the numbers
     * {@code 1..items}; presumably the extra width of automatic list numbers once they
     * reach two or more digits. Counts of 1000 and above are all treated as four-digit.
     */
    private static int extraNumberingDigits(int items) {
        if (items < 10) {
            return 0;
        }
        if (items < 100) {
            return items - 9;
        }
        if (items < 1000) {
            return 90 + (items - 99) * 2;
        }
        return 1890 + (items - 999) * 3;
    }

    /** Stream wrapper whose {@link #close()} is a no-op; use {@link #doClose()} to really close. */
    static class NoCloseInputStream extends FilterInputStream {
        public NoCloseInputStream(InputStream is) {
            super(is);
        }

        @Override
        public void close() throws IOException {
            // Intentionally empty: the XML reader closes its input after every parse.
        }

        /** Closes the wrapped stream. */
        public void doClose() throws IOException {
            super.close();
        }
    }

    /** SAX handler that adds the length of every {@code w:t} text run to the shared counter. */
    static class EndNotesXMLHandler extends DefaultHandler {
        private boolean inTextElement = false;
        private Integer[] cnt;
        private StringBuilder stringBuilder2;
        private boolean isDebug;

        EndNotesXMLHandler(Integer[] cnt, StringBuilder stringBuilder2, boolean isDebug) {
            this.cnt = cnt;
            this.stringBuilder2 = stringBuilder2;
            this.isDebug = isDebug;
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            // Using qualified name because we are not using xmlns prefixes here.
            if (qName.equals("w:t")) {
                inTextElement = true;
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            if (qName.equals("w:t")) {
                inTextElement = false;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (!inTextElement) {
                return;
            }
            cnt[0] += length;
            if (isDebug) {
                stringBuilder2.append(ch, start, length);
            }
        }
    }

    /**
     * SAX handler for {@code word/document.xml}: counts text characters, tabs, breaks,
     * section properties and numbered-list paragraphs into the shared counters.
     */
    static class DocumentXMLHandler extends DefaultHandler {
        private boolean inTabs = false;
        private boolean inNumPr = false;
        private boolean inTextElement = false;
        private boolean hasPStyle = false;
        private boolean inPicture = false;
        private Integer[] cnt;
        private Integer[] sectPrCount;
        private Integer[] brCount;
        private Integer[] numPrCount;
        private Integer ilvl = null;
        private Map<String, AtomicInteger> numIdMap;
        private StringBuilder stringBuilder2;
        private boolean isDebug;

        DocumentXMLHandler(Integer[] cnt, Integer[] sectPrCount, Integer[] brCount,
                           Integer[] numPrCount, Map<String, AtomicInteger> numIdMap,
                           StringBuilder stringBuilder2, boolean isDebug) {
            this.cnt = cnt;
            this.sectPrCount = sectPrCount;
            this.brCount = brCount;
            this.numPrCount = numPrCount;
            this.numIdMap = numIdMap;
            this.stringBuilder2 = stringBuilder2;
            this.isDebug = isDebug;
            numIdMap.clear();
        }

        @Override
        public void startElement(String uri, String localName, String qName, Attributes atts)
                throws SAXException {
            // Using qualified name because we are not using xmlns prefixes here.
            switch (qName) {
                case "w:tabs":
                    inTabs = true;
                    break;
                case "w:tab":
                    // A w:tab inside w:tabs is a tab-stop definition, not a typed tab.
                    if (!inTabs) {
                        cnt[0]++;
                    }
                    break;
                case "w:sectPr":
                    sectPrCount[0]++;
                    break;
                case "w:br":
                    // Only breaks without attributes are counted.
                    if (atts.getLength() == 0) {
                        brCount[0]++;
                    }
                    break;
                case "w:t":
                    inTextElement = true;
                    break;
                case "w:pStyle":
                    if (isAllDigits(atts.getValue("w:val"))) {
                        hasPStyle = true;
                    }
                    break;
                case "w:numPr":
                    inNumPr = true;
                    break;
                case "w:ilvl":
                    if (inNumPr) {
                        ilvl = Integer.valueOf(atts.getValue("w:val"));
                        // Two characters per numbering level; presumably a digit plus a
                        // separator such as "1." for each level of the label.
                        numPrCount[0] += (ilvl + 1) * 2;
                    }
                    break;
                case "w:numId":
                    if (inNumPr && hasPStyle) {
                        String key = atts.getValue("w:val") + "_" + ilvl;
                        numIdMap.computeIfAbsent(key, k -> new AtomicInteger(0)).incrementAndGet();
                    }
                    break;
                case "w:pict":
                    inPicture = true;
                    break;
                default:
                    break;
            }
        }

        @Override
        public void endElement(String namespaceURI, String localName, String qName)
                throws SAXException {
            switch (qName) {
                case "w:tabs":
                    inTabs = false;
                    break;
                case "w:pPr":
                    hasPStyle = false;
                    break;
                case "w:numPr":
                    inNumPr = false;
                    ilvl = null;
                    break;
                case "w:t":
                    inTextElement = false;
                    break;
                case "w:pict":
                    inPicture = false;
                    break;
                default:
                    break;
            }
        }

        @Override
        public void characters(char[] ch, int start, int length) {
            if (!inTextElement || inPicture) {
                return;
            }
            String stripped = null;
            if (length >= 767) {
                // Original author's note (translated): in Word doc format, when the number of
                // consecutive characters exceeds 767 (i.e. is at least 768), they are not
                // counted. The code drops every run of 767 or more consecutive spaces.
                stripped = new String(ch, start, length).replaceAll(" {767,}", "");
                length = stripped.length();
            }
            cnt[0] += length;
            if (isDebug) {
                if (stripped != null) {
                    // Append the stripped text itself; re-reading the buffer with the
                    // shortened length would copy the wrong characters.
                    stringBuilder2.append(stripped);
                } else {
                    stringBuilder2.append(ch, start, length);
                }
            }
        }

        @Override
        public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
            // Ignorable whitespace never counts.
        }

        /** Returns true when {@code value} is non-empty and consists only of digits. */
        private static boolean isAllDigits(String value) {
            if (value == null || value.isEmpty()) {
                return false;
            }
            for (int i = 0; i < value.length(); i++) {
                if (!Character.isDigit(value.charAt(i))) {
                    return false;
                }
            }
            return true;
        }
    }
}
    
   
  
 
网友评论