当前位置 : 主页 > 编程语言 > java >

文本分类成带变量的模版

来源:互联网 收集:自由互联 发布时间:2021-06-28
Text2Template.java /** * */package gist.util.text;import java.util.ArrayList;import java.util.Arrays;import java.util.Collections;import java.util.Comparator;import java.util.HashMap;import java.util.HashSet;import java.util.Iterator;import
Text2Template.java
/**
 * 
 */
package gist.util.text;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.TreeSet;
import java.util.regex.Pattern;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * 
 * Feature:输入一组文本内容, Text2Template将其分类到几组模版中.
 * [txtA, txtB, txtC, ...] -> Text2Template -> {tpl1:[txtA, txtB], tpl2:[txtC, ...], ...}
 * 
 * For example:
 * txtA = 20171108订单号111从仓库发出
 * txtB = 20171109订单号222从仓库发出
 * txtC = 订单号222预计送达日期20171110
 * [txtA, txtB, txtC] -> {#date#订单号#no#从仓库发出:[txtA, txtB]}
 * 
* *
 * (实现可能和这里定义有差异)
 * - 预处理: 去掉噪音
 * - 归类内容: 签名相同,长度相差lo~10
 * - 冒泡比较: 产出归类模版最少的一组模版
 * 文本向量: 所有字符归到一个排序数组,数组索引替换文本。 模版问题->向量的最佳压缩比?
 * 变量分析: TODO 判断条件
 *  字符与其他文本在索引[-5,+5]内存在相同的表示类似,否则记为变量开始位置,当满足类似特性时记为变量结束位置
 *  不同的连续数字作为变量
 * 归类模版: 使用变量替换文本产生模版,分析每个模版的变量,依据最佳变量得出需要的模版
 * 
 * 什么是最佳变量:
 *   变量对应的向量离散值最小
 * 
* * TODO 性能 * * @author dzh * @date Nov 8, 2017 3:01:24 PM * @since 1.0.0 */ public class Text2Template { static Logger LOG = LoggerFactory.getLogger(Text2Template.class); private List texts; private Cleaner cleaner; private Matcher matcher; private char[] charsTable; private TestExporter t; // 测试时使用 private Text2Template() {} public static final Map > transfrom(List texts, Cleaner cleaner, Matcher matcher) { Text2Template t2t = new Text2Template(); return t2t.trasnformImpl(texts, cleaner, matcher); } private Map > trasnformImpl(List texts, Cleaner cleaner, Matcher matcher) { if (texts == null || texts.size() < 2) { return Collections. > emptyMap(); } if (matcher == null) { matcher = new SimpleMatcher(); } if (cleaner == null) { cleaner = new SimpleCleaner(); } this.matcher = matcher; // 预处理 this.cleaner = cleaner; this.initTexts(texts); // 全局字符表,初始化t.vec this.initCharsTable(this.texts); // TODO 归类文本, 将相似的文本放在一起计算模版, 减少匹配次数 // HashSet tplSet = new HashSet (); TemplateCalc calc = new SymmetricTemplateCalc(); for (int i = 0; i < this.texts.size(); i++) { Text bt = this.texts.get(i); for (int j = i + 1; j < this.texts.size(); j++) { Text ct = this.texts.get(j); String tpl = calc.calc(bt.tid, ct.tid); if (tpl != null) { // 合并模版 Iterator iter = tplSet.iterator(); boolean added = true; while (iter.hasNext()) { String tt = iter.next(); if (this.matcher.match(tpl, tt)) { iter.remove(); } else if (this.matcher.match(tt, tpl)) { added = false; break; } } if (added) tplSet.add(tpl); } } } // Map > tpl2Text = new HashMap<>(); for (String tpl : tplSet) { tpl2Text.put(tpl, new LinkedList ()); for (Text t : this.texts) { if (this.matcher.match(tpl, t.txt)) { tpl2Text.get(tpl).add(t.txt); } } } return tpl2Text; } public static final TestExporter testTransfrom(List texts, Cleaner cleaner, Matcher matcher) { Text2Template t2t = new Text2Template(); t2t.trasnformImpl(texts, cleaner, matcher); t2t.t = t2t.new TestExporter(); return t2t.t; } /** * 建立全局字符表 * * @param texts */ void initCharsTable(List texts) { TreeSet set = new TreeSet<>(new Comparator () { @Override public int compare(Character o1, Character o2) { return o1.charValue() - o2.charValue(); } }); for (Text t : texts) { for (char ch : t.clean().toCharArray()) { set.add(ch); } } charsTable = new char[set.size()]; Iterator chIt = set.iterator(); int i = 0; while (chIt.hasNext()) { charsTable[i++] = chIt.next(); } LOG.info("charsTable {}", charsTable); for (Text t : texts) { t.toVec(); } } public Text text(int tid) { return texts.get(tid); } void initTexts(List text) { texts = new ArrayList<>(text.size()); for (int i = 0; i < text.size(); i++) { Text t = new Text(); t.tid = i; t.txt = text.get(i); texts.add(i, t); } } public class Text { public int tid; // index of texts public String txt; // content public int[] vec; // txt的有效向量值:clean()->toVec() public String clean() { if (cleaner == null) return this.txt; return cleaner.clean(this.txt); } public void toVec() { if (vec == null) { String clnTxt = clean(); vec = new int[clnTxt.length()]; if (charsTable != null) { // 用全局字符表的下标作为vec for (int i = 0; i < clnTxt.length(); i++) { vec[i] = Arrays.binarySearch(charsTable, clnTxt.charAt(i)); } } // LOG.info("{} {}", tid, vec); } } public double area() { double area = vec[0] / 2.0; for (int x = 1; x < vec.length; x++) { area += (vec[x - 1] + vec[x]) / 2.0; } return area; } @Override public boolean equals(Object obj) { if (obj != null && obj instanceof Text) { return this.tid == ((Text) obj).tid; } return false; } } public class Template { public String tpl; public List text = new LinkedList<>(); // matched text public int matched() { // matched text return text.size(); } } public static interface Cleaner { String clean(String t); } public static interface Matcher { boolean match(String tpl, String text); } /** * 使用默认的模版格式匹配 * */ public static class SimpleMatcher implements Matcher { @Override public boolean match(String tpl, String text) { String ptpl = tpl.replaceAll("#var[1-9]\\d?#", ".*"); // LOG.info(ptpl); return Pattern.matches("^" + ptpl, text); // TODO 性能优化 } } public static class SimpleCleaner implements Cleaner { @Override public String clean(String t) { return t.trim(); } } public abstract class TemplateCalc { /** * * @param btid * base tid * @param ctid * compared tid * @return tpl or null if not matched */ public abstract String calc(int btid, int ctid); } /** * 数字变量 忽略类似可能的金额、时间戳等 */ public static final Pattern P_VAR_DIGIT = Pattern.compile("(\\d+[.,:]?\\d*)+"); public static final Pattern P_VAR_SHORTURL = Pattern.compile("(?:\\s*http\\://)?(?:\\s*[\\w\\.]*\\.(?:cn|com|io|org)/[\\w\\.]*)?\\s?"); public static final int CONST_MIN_LEN = 3; // 常量最小长度 public static final int VAR_MAX_LEN = 10; // 变量最大长度 /** * 对称匹配,相似的文本互相得出的模版内容是一样的 */ public class SymmetricTemplateCalc extends TemplateCalc { @Override public String calc(int btid, int ctid) { String btp = calcImpl(btid, ctid); // LOG.info("calc {} {} {}", btid, ctid, btp); String ctp = calcImpl(ctid, btid); // LOG.info("calc {} {} {}", ctid, btid, ctp); if (btp.equals(ctp)) return btp; if (btp.contains(ctp)) return btp; if (ctp.contains(btp)) return ctp; return null; } String calcImpl(int btid, int ctid) { int[] bsVec = new int[texts.get(btid).vec.length]; System.arraycopy(texts.get(btid).vec, 0, bsVec, 0, texts.get(btid).vec.length); Scan bs = new Scan(bsVec); int[] csVec = new int[texts.get(ctid).vec.length]; System.arraycopy(texts.get(ctid).vec, 0, csVec, 0, texts.get(ctid).vec.length); Scan cs = new Scan(csVec); while (!bs.isOver()) { Scan bs1 = bs.clone(); Scan cs1 = cs.clone(); // 分析常量 if (bs1.readConst(cs1, 0)) { Scan bs2 = bs.clone(); Scan cs2 = cs.clone(); if (bs2.readConst(cs2, 1) && bs2.constLen() > bs1.constLen()) { // 第二次匹配的效果更好 bs1 = bs2; cs1 = cs2; } if (!ignoreConst(bs1) && !ignoreConst(cs1)) { bs1.markVar(); bs1.marke = bs1.pos - 1; // bs1.marke = bs1.nextMarke(); } else { bs1.mark = -1; } } bs = bs1; } bs.markVar(); bs.compact(); return bs.vecStr(); } private boolean ignoreConst(Scan sc) { String c = sc.constStr(); // LOG.info("{} {}", sc.var(), c); if (P_VAR_DIGIT.matcher(c + (sc.isOver() ? "" : sc.charAt(sc.pos))).matches()) { // LOG.info("{} match digit", c); return true; } if (P_VAR_SHORTURL.matcher(c).matches()) { // LOG.info("{} match shorturl", c); return true; } if (P_VAR_SHORTURL.matcher(sc.var() + c).matches()) { // LOG.info("{} match shorturl", c); return true; } return false; } /** * 常量扫描 * * 常量定义: 至少连续3个字符相同 */ public class Scan { private int[] vec; public Scan(int[] t) { this.vec = t; reset(); } public int mark = -1; // 常量匹配开始位置 public int pos = 0; // 匹配位置 public int var = 0; // 变量名 public int marke = -1; // 最近一次匹配的常量结束位置 public void reset() { mark = -1; pos = 0; var = 0; } /** * 压缩向量表 */ public void compact() { int[] comp = new int[vec.length]; int j = -1; for (int i = 0; i < vec.length; i++) { if (vec[i] >= 0) { comp[++j] = vec[i]; } else { if (j >= 0 && vec[i] == comp[j]) { continue; } else { comp[++j] = vec[i]; } } } vec = new int[j + 1]; System.arraycopy(comp, 0, vec, 0, j + 1); } public Scan clone() { Scan s = new Scan(this.vec); s.mark = this.mark; s.pos = this.pos; s.var = this.var; s.marke = this.marke; return s; } public char charAt(int pos) { return charsTable[vec[pos]]; } public boolean isOver() { return pos > vec.length - 1; } /** * 读取一个变量 */ public boolean readConst(Scan cs, int matchSkip) { if (cs.isOver()) { return false; } int val = read(); // while (true) { // val = read(); // if (val > 0 && charsTable[val] == ' ') { // continue; // } // break; // } boolean matched = false; int skip = 0; MATCH: while (cs.match(val)) { if (++skip == matchSkip) continue; mark(); int matchLen = 1; while (nextMatch(cs)) { matchLen++; } if (matchLen > 1) { // 匹配长度>1说明可能是常量 TODO 字符和数字类型的要求的长度更长些 matched = true; break MATCH; } } if (!matched) { mark = -1; } return matched; } boolean isMatchEnd(int pos) { if (pos > (vec.length - 1)) return true; char ch = charAt(pos); if (ch == ' ' || ch == '\t') return true; return false; } /** * 当前匹配的常量长度 * * @return */ int constLen() { return pos - mark; } String constStr() { StringBuilder buf = new StringBuilder(constLen()); for (int pos = this.mark; pos < this.pos; pos++) { buf.append(charsTable[vec[pos]]); } return buf.toString(); } String vecStr() { StringBuilder buf = new StringBuilder(vec.length); for (int pos = 0; pos < vec.length; pos++) { if (vec[pos] < 0) { buf.append("#var" + Math.abs(vec[pos]) + "#"); } else { buf.append(charsTable[vec[pos]]); } } return buf.toString(); } String var() { int varStart = 0; int varEnd = mark - 1; for (int pos = varEnd; pos > -1; pos--) { if (pos == marke || vec[pos] < 0) { // 变量的向量值都是小于0的 varStart = pos + 1; break; } } StringBuilder buf = new StringBuilder(varEnd - varStart + 1); for (int pos = varStart; pos <= varEnd; pos++) { buf.append(charsTable[vec[pos]]); } return buf.toString(); } /** * 设置向量表的变量值 * * @return 变量长度 */ int markVar() { if (mark > 0) { var--; int varStart = 0; int varEnd = mark - 1; for (int pos = varEnd; pos > -1; pos--) { if (pos == marke || this.vec[pos] < 0) { // 变量的向量值都是小于0的 varStart = pos + 1; break; } } varStart = skip(varStart, ' '); for (int pos = varStart; pos <= varEnd; pos++) { this.vec[pos] = var; } mark = -1; return varEnd - varStart + 1; } if (isOver() && marke > 0 && marke < this.vec.length) { // 结尾的变量 var--; for (int pos = marke + 1; pos < this.vec.length; pos++) { this.vec[pos] = var; } int len = vec.length - marke - 1; marke = -1; return len; } return 0; } private int skip(int pos, char ch) { while (pos < vec.length && charAt(pos) == ch) { pos++; } return pos; } void mark() { mark = pos - 1; } // public int nextMarke() { // while (!isOver() && charAt(pos++) == ' ') { // } // return pos; // } int read() { if (isOver()) return -1; return vec[pos++]; } // int skip() { // while (true) { // int val = vec[pos++]; // if (charsTable[val] != ' ') return val; // } // } boolean match(int val) { for (; pos < vec.length;) { if (vec[pos++] == val) { mark(); return true; } } return false; } boolean nextMatch(Scan cs) { if (cs.isOver() || this.isOver()) return false; if (isMatchEnd(pos)) return false; if (read() == cs.read()) { return true; } else { pos--; cs.pos--; return false; } } } } public class TestExporter { public List getTexts() { return texts; } public char[] getCharsTable() { return charsTable; } } }
网友评论