当前位置 : 主页 > 编程语言 > c++ >

使用 SimHash 算法检测文本相似度(Java 实现)

来源:互联网 收集:自由互联 发布时间:2021-07-03
gistfile1.txt package similarity;import java.io.File;import java.io.IOException;import java.math.BigDecimal;import java.math.BigInteger;import java.util.HashMap;import java.util.StringTokenizer;/** * * simhash * @author tzj * */public class
gistfile1.txt
package similarity;

import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.math.RoundingMode;
import java.util.HashMap;
import java.util.StringTokenizer;

/**
 * 
 * simhash 
 * @author tzj
 *
 */
/**
 * Computes a SimHash fingerprint for a whitespace-delimited text.
 *
 * <p>The text is split with {@link StringTokenizer}, every token is hashed with
 * {@link #hash64(String)}, and each bit position of the fingerprint is decided
 * by a vote over all token hashes (+1 when the token's bit is set, -1 otherwise).
 * The fingerprint is computed once, in the constructor.
 */
public class SimHash {

	/** Fingerprint width used by the single-argument constructor. */
	private static final int DEFAULT_HASH_BITS = 64;

	/** The raw text whose tokens are hashed. */
	private final String tokens;

	/** The fingerprint as a number; bit {@code i} is the vote result for position {@code i}. */
	private final BigInteger intSimHash;

	/**
	 * The fingerprint as a string of '0'/'1' characters. Character {@code i}
	 * corresponds to bit {@code i}, so the least-significant bit comes first.
	 * Rewritten on every call to {@link #simHash()}.
	 */
	private String strSimHash;

	/** Number of bit positions in the fingerprint. */
	private final int hashbits;

	/**
	 * Builds a 64-bit fingerprint for the given text.
	 *
	 * @param tokens the text to fingerprint
	 */
	public SimHash(String tokens) {
		this(tokens, DEFAULT_HASH_BITS);
	}

	/**
	 * Builds a fingerprint of the requested width for the given text.
	 *
	 * @param tokens   the text to fingerprint
	 * @param hashbits number of bit positions in the fingerprint; since token
	 *                 hashes are 64 bits wide, positions at index 64 and above
	 *                 never receive a +1 vote
	 */
	public SimHash(String tokens, int hashbits) {
		this.tokens = tokens;
		this.hashbits = hashbits;
		this.intSimHash = this.simHash();
	}

	// NOTE(review): not used anywhere in this class. The declaration was split
	// across several lines in the original and its type arguments appear to
	// have been lost; it is kept as a raw type so existing uses still compile.
	HashMap wordMap = new HashMap<>();

	/**
	 * Computes the SimHash fingerprint of {@link #tokens} and refreshes
	 * {@link #strSimHash} as a side effect.
	 *
	 * @return the fingerprint; bit {@code i} is set when the vote total for
	 *         position {@code i} is greater than or equal to zero
	 */
	public BigInteger simHash() {
		// Per-position vote totals (the feature vector).
		int[] votes = new int[this.hashbits];

		// 1. Split the text into tokens.
		StringTokenizer tokenizer = new StringTokenizer(this.tokens);
		while (tokenizer.hasMoreTokens()) {
			// 2. Hash each token to a fixed-width (64-bit) value.
			long tokenHash = hash64(tokenizer.nextToken());

			// 3. Vote per bit position: +1 where the token's bit is 1, -1 where
			// it is 0. Real-world use would add/subtract a per-token weight
			// instead of a flat 1.
			for (int i = 0; i < this.hashbits; i++) {
				// Java masks a long shift distance to its low 6 bits, so the
				// explicit range check is what makes positions >= 64 read as 0.
				boolean bitSet = i < Long.SIZE && ((tokenHash >>> i) & 1L) != 0;
				votes[i] += bitSet ? 1 : -1;
			}
		}

		// 4. Collapse the votes into the fingerprint.
		// NOTE(review): the original comment said "greater than 0 -> 1", but the
		// code has always used >= 0, so a tie (including the no-token case)
		// yields a 1 bit. The behaviour is kept as-is; confirm which is intended.
		BigInteger fingerprint = BigInteger.ZERO;
		StringBuilder bits = new StringBuilder(this.hashbits);
		for (int i = 0; i < this.hashbits; i++) {
			if (votes[i] >= 0) {
				fingerprint = fingerprint.setBit(i);
				bits.append('1');
			} else {
				bits.append('0');
			}
		}
		this.strSimHash = bits.toString();
		return fingerprint;
	}

	/**
	 * Hashes a string to 64 bits with a multiply-by-31 polynomial over its
	 * UTF-16 code units. Overflow wraps silently, as usual for {@code long}.
	 *
	 * @param string the string to hash
	 * @return the 64-bit hash; the seed value itself for an empty string
	 */
	public static long hash64(String string) {
		long h = 1125899906842597L; // seed; labelled "prime" by the original author
		int len = string.length();

		for (int i = 0; i < len; i++) {
			h = 31 * h + string.charAt(i);
		}
		return h;
	}

	/**
	 * Derives a score in the range 0.00 to 1.00 from a 64-bit value.
	 *
	 * <p>The low four bits of each of the eight bytes of {@code l1} are read as
	 * a number from 0 to 15 and summed (maximum {@code 15 * 8 = 120}). The result
	 * is {@code (120 - sum) / 120}, rounded half-even to two decimal places.
	 *
	 * <p>NOTE(review): {@code main} passes the XOR of two fingerprints here, but
	 * this is not a Hamming-distance measure: the high four bits of every byte
	 * are ignored and nibble values are summed rather than set bits counted.
	 * Confirm that this is the intended metric.
	 *
	 * @param l1 the 64-bit value to score
	 * @return the score with scale 2
	 */
	public static BigDecimal yxxsd(long l1) {
		final long nibbleMask = 15;
		final long maxSum = 15 * 8;

		long sum = 0;
		for (int i = 0; i < 8; i++) {
			sum += (l1 >>> (8 * i)) & nibbleMask;
		}

		return BigDecimal.valueOf(maxSum - sum)
				.divide(BigDecimal.valueOf(maxSum), 2, RoundingMode.HALF_EVEN);
	}

	/**
	 * Demo driver: fingerprints the files {@code a2.txt} and {@code a.txt},
	 * prints both fingerprints, then calls {@link #yxxsd(long)} ten million
	 * times on their XOR and prints {@code ok}.
	 *
	 * @param args unused
	 * @throws IOException declared by the original signature
	 */
	public static void main(String[] args) throws IOException {
		// LuaTest is not defined in this file; f2s presumably returns the
		// file's contents as a String -- confirm against its definition.
		String s1 = LuaTest.f2s(new File("a2.txt"));
		String s2 = LuaTest.f2s(new File("a.txt"));

		SimHash strSimHash1 = new SimHash(s1);
		SimHash strSimHash2 = new SimHash(s2);
		System.out.println(strSimHash1.strSimHash);
		System.out.println(strSimHash2.strSimHash);
		System.out.println(strSimHash1.intSimHash + "\t" + strSimHash2.intSimHash);

		// The XOR does not change between iterations, so compute it once.
		long diff = strSimHash1.intSimHash.xor(strSimHash2.intSimHash).longValue();
		for (int i = 0; i < 10000000; i++) {
			yxxsd(diff);
		}
		System.out.println("ok");
	}
}
 
网友评论