gistfile1.txt package similarity;import java.io.File;import java.io.IOException;import java.math.BigDecimal;import java.math.BigInteger;import java.util.HashMap;import java.util.StringTokenizer;/** * * simhash * @author tzj * */public class
package similarity;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.HashMap;
import java.util.Objects;
import java.util.StringTokenizer;

/**
 * SimHash fingerprint of a text.
 *
 * <p>The text is split with a default {@link StringTokenizer}; each token is hashed to a
 * {@code long} with {@link #hash64(String)}; for every bit position a counter is incremented when
 * the token hash has that bit set and decremented otherwise. Bit positions whose counter ends up
 * non-negative are set in the resulting fingerprint.
 *
 * @author tzj
 */
public class SimHash {

    /** Fingerprint width used when the caller does not supply one. */
    private static final int DEFAULT_HASH_BITS = 64;

    /** Text the fingerprint is computed from. */
    private final String tokens;

    /** Fingerprint as a number; bit {@code i} corresponds to counter {@code i}. */
    private final BigInteger intSimHash;

    /** Fingerprint as a string of '0'/'1' characters, bit 0 first. Set by {@link #simHash()}. */
    private String strSimHash;

    /** Number of fingerprint bits. */
    private final int hashbits;

    // NOTE(review): the type arguments of this declaration were lost when the file was pasted
    // (it read "HashMapwordMap = new HashMap<>()"); <String, Integer> is a guess - confirm.
    // The map is not read or written anywhere in this class.
    HashMap<String, Integer> wordMap = new HashMap<>();

    /**
     * Builds a 64-bit fingerprint of {@code tokens}.
     *
     * @param tokens text to fingerprint; must not be {@code null}
     */
    public SimHash(String tokens) {
        this(tokens, DEFAULT_HASH_BITS);
    }

    /**
     * Builds a fingerprint of {@code tokens} with the given width.
     *
     * <p>Token hashes are 64-bit, so bit positions at index 64 and above never see a set bit.
     *
     * @param tokens   text to fingerprint; must not be {@code null}
     * @param hashbits number of fingerprint bits
     */
    public SimHash(String tokens, int hashbits) {
        this.tokens = Objects.requireNonNull(tokens, "tokens");
        this.hashbits = hashbits;
        this.intSimHash = this.simHash();
    }

    /**
     * Returns the fingerprint computed when this instance was constructed.
     *
     * @return the fingerprint as a non-negative number
     */
    public BigInteger getIntSimHash() {
        return this.intSimHash;
    }

    /**
     * Returns the fingerprint as a string of '0'/'1' characters, bit 0 first.
     *
     * @return the bit string produced by the most recent {@link #simHash()} call
     */
    public String getStrSimHash() {
        return this.strSimHash;
    }

    /**
     * Computes the SimHash value of the text and refreshes the bit-string form as a side effect.
     *
     * @return the fingerprint; bit {@code i} is set when counter {@code i} is non-negative
     */
    public BigInteger simHash() {
        // Feature vector: one signed counter per fingerprint bit.
        int[] v = new int[this.hashbits];

        // 1. Split the text into tokens (default StringTokenizer delimiters).
        StringTokenizer stringTokens = new StringTokenizer(this.tokens);
        while (stringTokens.hasMoreTokens()) {
            // 2. Hash every token to a fixed-width value, here a 64-bit long.
            long t = hash64(stringTokens.nextToken());

            // 3. For each bit position: +1 when the token hash has the bit set, -1 otherwise.
            //    A real deployment would add/subtract a per-token weight instead of 1.
            for (int i = 0; i < this.hashbits; i++) {
                // Positions beyond the 64 bits of the hash are always treated as unset; the
                // explicit bound is needed because long shifts wrap modulo 64.
                boolean bitSet = i < Long.SIZE && ((t >>> i) & 1L) != 0;
                v[i] += bitSet ? 1 : -1;
            }
        }

        // 4. Collapse the counters: non-negative -> 1, negative -> 0. A counter of exactly zero
        //    therefore yields a set bit, so a text without tokens produces an all-ones value.
        BigInteger fingerprint = BigInteger.ZERO;
        StringBuilder simHashBuffer = new StringBuilder(this.hashbits);
        for (int i = 0; i < this.hashbits; i++) {
            if (v[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
                simHashBuffer.append('1');
            } else {
                simHashBuffer.append('0');
            }
        }
        this.strSimHash = simHashBuffer.toString();
        return fingerprint;
    }

    /**
     * Hashes a string to 64 bits with a multiply-by-31 polynomial over its UTF-16 code units.
     * Overflow wraps silently.
     *
     * @param string the text to hash; must not be {@code null}
     * @return the 64-bit hash
     */
    public static long hash64(String string) {
        long h = 1125899906842597L; // seed; marked "prime" by the original author
        int len = string.length();
        for (int i = 0; i < len; i++) {
            h = 31 * h + string.charAt(i);
        }
        return h;
    }

    /**
     * Derives a score in [0, 1] from a 64-bit value: the low 4 bits of each of the 8 bytes are
     * summed (at most 15 * 8 = 120) and the result is {@code (120 - sum) / 120}, rounded
     * half-even to two decimal places.
     *
     * <p>NOTE(review): {@code main} feeds this the XOR of two fingerprints, so it is presumably
     * meant as a similarity score. It is not a Hamming distance, and the upper 4 bits of every
     * byte are ignored - confirm that this is intended.
     *
     * @param l1 the value to score
     * @return the score with scale 2
     */
    public static BigDecimal yxxsd(long l1) {
        final long mask = 15;
        final long maxSum = 15 * 8;
        long tmpsum = 0;
        for (int i = 0; i < 8; i++) {
            tmpsum += (l1 & (mask << (8 * i))) >> (8 * i);
        }
        return BigDecimal.valueOf(maxSum - tmpsum)
                .divide(BigDecimal.valueOf(maxSum), 2, RoundingMode.HALF_EVEN);
    }

    /**
     * Demo: fingerprints the files {@code a2.txt} and {@code a.txt}, prints both fingerprints,
     * then calls {@link #yxxsd(long)} ten million times on their XOR and prints {@code ok}.
     *
     * @param args unused
     * @throws IOException declared by the original; presumably raised by {@code LuaTest.f2s}
     */
    public static void main(String[] args) throws IOException {
        String s1 = LuaTest.f2s(new File("a2.txt"));
        String s2 = LuaTest.f2s(new File("a.txt"));
        SimHash strSimHash1 = new SimHash(s1);
        SimHash strSimHash2 = new SimHash(s2);
        System.out.println(strSimHash1.strSimHash);
        System.out.println(strSimHash2.strSimHash);
        System.out.println(strSimHash1.intSimHash + "\t" + strSimHash2.intSimHash);

        // The XOR does not change between iterations, so compute it once.
        long differingBits = strSimHash1.intSimHash.xor(strSimHash2.intSimHash).longValue();
        // Presumably a crude timing loop; the results are discarded.
        for (int i = 0; i < 10000000; i++) {
            yxxsd(differingBits);
        }
        System.out.println("ok");
    }
}