// gistfile1.txt
package similarity;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.HashMap;
import java.util.StringTokenizer;
/**
 * SimHash fingerprint of a text.
 *
 * <p>The text is split with {@link StringTokenizer}'s default delimiters, every token is
 * hashed with {@link #hash64(String)}, and each fingerprint bit is decided by a vote over
 * the corresponding bit of all token hashes. The fingerprint is computed once, at
 * construction time.
 *
 * <p>Token hashes are 64 bits wide. When more than 64 fingerprint bits are requested, the
 * positions from 64 upwards receive a "0" vote from every token.
 *
 * @author tzj
 */
public class SimHash {

    /** Width in bits of the value returned by {@link #hash64(String)}. */
    private static final int TOKEN_HASH_BITS = Long.SIZE;

    /** Fingerprint width used by {@link #SimHash(String)}. */
    private static final int DEFAULT_HASH_BITS = 64;

    /** Mask selecting the low four bits of a byte. */
    private static final long LOW_NIBBLE_MASK = 0xFL;

    /** Number of bytes inspected by {@link #yxxsd(long)}. */
    private static final int BYTES_PER_LONG = 8;

    /** Largest possible sum of eight 4-bit values (15 * 8). */
    private static final long MAX_NIBBLE_SUM = 15L * BYTES_PER_LONG;

    private final String tokens;
    private final BigInteger intSimHash;
    /** Fingerprint as '0'/'1' characters; index 0 is the least significant bit. */
    private final String strSimHash;
    private final int hashbits;

    // Not read or written anywhere in this class; kept package-visible and raw-typed
    // exactly as before so that any same-package code using it keeps compiling.
    @SuppressWarnings("rawtypes")
    HashMap wordMap = new HashMap<>();

    /**
     * Creates a 64-bit fingerprint of {@code tokens}.
     *
     * @param tokens text to fingerprint
     * @throws NullPointerException if {@code tokens} is null
     */
    public SimHash(String tokens) {
        this(tokens, DEFAULT_HASH_BITS);
    }

    /**
     * Creates a fingerprint of {@code tokens} with {@code hashbits} bits.
     *
     * @param tokens   text to fingerprint
     * @param hashbits number of fingerprint bits; values above 64 add bit positions that
     *                 no token hash can set
     * @throws NullPointerException     if {@code tokens} is null
     * @throws IllegalArgumentException if {@code hashbits} is negative
     */
    public SimHash(String tokens, int hashbits) {
        if (tokens == null) {
            throw new NullPointerException("tokens must not be null");
        }
        if (hashbits < 0) {
            throw new IllegalArgumentException("hashbits must not be negative: " + hashbits);
        }
        this.tokens = tokens;
        this.hashbits = hashbits;
        // Private static helpers are used here so that construction never dispatches to
        // an overridable method on a partially initialised object.
        this.intSimHash = computeFingerprint(tokens, hashbits);
        this.strSimHash = toBitString(this.intSimHash, hashbits);
    }

    /**
     * Returns the SimHash value of the text given at construction.
     *
     * @return the fingerprint as a non-negative integer; bit {@code i} of the fingerprint
     *         is bit {@code i} of the returned value
     */
    public BigInteger simHash() {
        // The inputs are immutable, so the value computed by the constructor is reused.
        return this.intSimHash;
    }

    /** Builds the fingerprint of {@code text} using {@code bits} bit positions. */
    private static BigInteger computeFingerprint(String text, int bits) {
        // Feature vector: one signed vote counter per fingerprint bit.
        int[] votes = new int[bits];

        // 1. Split the text into tokens.
        StringTokenizer tokenizer = new StringTokenizer(text);
        while (tokenizer.hasMoreTokens()) {
            // 2. Hash every token to a fixed-width (64-bit) value.
            long tokenHash = hash64(tokenizer.nextToken());

            // 3. For every bit position add 1 if the token hash has that bit set and
            //    subtract 1 otherwise. A production implementation would add/subtract a
            //    per-token weight instead of a constant 1.
            for (int i = 0; i < bits; i++) {
                // Positions beyond the width of the token hash always count as 0; the
                // explicit guard is required because Java reduces shift distances mod 64.
                boolean bitSet = i < TOKEN_HASH_BITS && ((tokenHash >>> i) & 1L) != 0;
                votes[i] += bitSet ? 1 : -1;
            }
        }

        // 4. A non-negative counter yields a 1 bit, a negative counter a 0 bit. Ties
        //    (including the no-token case, where every counter is 0) therefore yield 1.
        BigInteger fingerprint = BigInteger.ZERO;
        for (int i = 0; i < bits; i++) {
            if (votes[i] >= 0) {
                fingerprint = fingerprint.setBit(i);
            }
        }
        return fingerprint;
    }

    /** Renders the low {@code bits} bits of {@code fingerprint}, least significant first. */
    private static String toBitString(BigInteger fingerprint, int bits) {
        StringBuilder rendered = new StringBuilder(bits);
        for (int i = 0; i < bits; i++) {
            rendered.append(fingerprint.testBit(i) ? '1' : '0');
        }
        return rendered.toString();
    }

    /**
     * Computes a 64-bit polynomial hash (multiplier 31) over the UTF-16 code units of
     * {@code string}. Arithmetic overflow wraps around.
     *
     * @param string text to hash
     * @return the hash; the seed value {@code 1125899906842597} for the empty string
     * @throws NullPointerException if {@code string} is null
     */
    public static long hash64(String string) {
        long h = 1125899906842597L; // prime
        int len = string.length();
        for (int i = 0; i < len; i++) {
            h = 31 * h + string.charAt(i);
        }
        return h;
    }

    /**
     * Scores a 64-bit value from the low four bits of each of its eight bytes.
     *
     * <p>The eight 4-bit values are summed (0..120) and the result is
     * {@code (120 - sum) / 120}, rounded half-even to two decimal places. {@link #main}
     * passes the XOR of two fingerprints, so {@code 1.00} is returned when all inspected
     * bits agree.
     *
     * <p>NOTE(review): the high four bits of every byte are ignored and the inspected
     * nibbles are weighted by numeric value rather than by number of differing bits.
     * Confirm this is intended before relying on the result as a similarity measure.
     *
     * @param l1 value to score
     * @return a value between {@code 0.00} and {@code 1.00} with scale 2
     */
    public static BigDecimal yxxsd(long l1) {
        long nibbleSum = 0;
        for (int i = 0; i < BYTES_PER_LONG; i++) {
            nibbleSum += (l1 >>> (8 * i)) & LOW_NIBBLE_MASK;
        }
        return BigDecimal.valueOf(MAX_NIBBLE_SUM - nibbleSum)
                .divide(BigDecimal.valueOf(MAX_NIBBLE_SUM), 2, RoundingMode.HALF_EVEN);
    }

    /**
     * Demo: fingerprints the files {@code a2.txt} and {@code a.txt}, prints both
     * fingerprints as bit strings and as integers, then calls {@link #yxxsd(long)} ten
     * million times on their XOR and prints {@code ok}.
     *
     * @param args ignored
     * @throws IOException declared for the file loading done through {@code LuaTest.f2s}
     */
    public static void main(String[] args) throws IOException {
        String s1 = LuaTest.f2s(new File("a2.txt"));
        String s2 = LuaTest.f2s(new File("a.txt"));
        SimHash first = new SimHash(s1);
        SimHash second = new SimHash(s2);
        System.out.println(first.strSimHash);
        System.out.println(second.strSimHash);
        System.out.println(first.intSimHash + "\t" + second.intSimHash);

        // The XOR does not change between iterations, so it is computed once.
        long difference = first.intSimHash.xor(second.intSimHash).longValue();
        for (int i = 0; i < 10000000; i++) {
            yxxsd(difference);
        }
        System.out.println("ok");
    }
}
