java simhash是什么?让我们一起来了解一下吧!
java simhash是java程序中的一种算法。Simhash算法产生与2002年,设计非常美妙,它输入是一个向量,得出的结果是一个F位的签名值。
Simhash和一般的hash算法不同,它具有两个关键的特点:
1.一个文档的指纹是所有属性的某种hash;
2.相似文档的hash应该是相似的;
simhash 算法如下:
1,将一个 f 维的向量 V 初始化为 0 ; f 位的二进制数 S 初始化为 0 ;
2,对每一个特征:用传统的 hash 算法对该特征产生一个 f 位的签名 b 。对 i=1 到 f :
如果b 的第 i 位为 1 ,则 V 的第 i 个元素加上该特征的权重;
否则,V 的第 i 个元素减去该特征的权重。
3,如果 V 的第 i 个元素大于 0 ,则 S 的第 i 位为 1 ,否则为 0 ;
4,输出 S 作为签名。
simhash 算法代码:
package com.xxxx.checkandbigdataquery.utils; import it.unimi.dsi.fastutil.longs.LongOpenHashSet; import it.unimi.dsi.fastutil.longs.LongSet; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.nio.CharBuffer; import java.util.Set; /** * a basic SimHash implementation * * */ public class SimHash { public static final int HASH_SIZE = 64; public static final long HASH_RANGE = 2 ^ HASH_SIZE; public static MurmurHash hasher = new MurmurHash(); /** * use short cuts to obtains a speed optimized simhash calculation * * @param s * input string * @return 64 bit simhash of input string */ private static final int FIXED_CGRAM_LENGTH = 4; public static long computeOptimizedSimHashForString(String s) { return computeOptimizedSimHashForString(CharBuffer.wrap(s)); } public static long computeOptimizedSimHashForString(CharBuffer s) { LongSet shingles = new LongOpenHashSet(Math.min(s.length(), 100000)); int length = s.length(); long timeStart = System.currentTimeMillis(); for (int i = 0; i < length - FIXED_CGRAM_LENGTH + 1; i++) { // extract an ngram long shingle = s.charAt(i); shingle <<= 16; shingle |= s.charAt(i + 1); shingle <<= 16; shingle |= s.charAt(i + 2); shingle <> 56); longAsBytes[1] = (byte) (shingle >> 48); longAsBytes[2] = (byte) (shingle >> 40); longAsBytes[3] = (byte) (shingle >> 32); longAsBytes[4] = (byte) (shingle >> 24); longAsBytes[5] = (byte) (shingle >> 16); longAsBytes[6] = (byte) (shingle >> 8); longAsBytes[7] = (byte) (shingle); long longHash = FPGenerator.std64.fp(longAsBytes, 0, 8); for (int i = 0; i < HASH_SIZE; ++i) { boolean bitSet = ((longHash >> i) & 1L) == 1L; v[i] += (bitSet) ? 1 : -1; } } long simhash = 0; for (int i = 0; i < HASH_SIZE; ++i) { if (v[i] > 0) { simhash |= (1L << i); } } return simhash; } public static long computeSimHashFromString(Set shingles) { int v[] = new int[HASH_SIZE]; // compute a set of shingles for (String shingle : shingles) { byte[] bytes = shingle.getBytes(); long longHash = FPGenerator.std64.fp(bytes, 0, bytes.length); // long hash1 = hasher.hash(bytes, bytes.length, 0); // long hash2 = hasher.hash(bytes, bytes.length, (int)hash1); // long longHash = (hash1 << 32) | hash2; for (int i = 0; i < HASH_SIZE; ++i) { boolean bitSet = ((longHash >> i) & 1L) == 1L; v[i] += (bitSet) ? 1 : -1; } } long simhash = 0; for (int i = 0; i < HASH_SIZE; ++i) { if (v[i] > 0) { simhash |= (1L << i); } } return simhash; } public static int hammingDistance(long hash1, long hash2) { long bits = hash1 ^ hash2; int count = 0; while (bits != 0) { bits &= bits - 1; ++count; } return count; } public static long rotate(long hashValue) { return (hashValue << 1) | (hashValue >>> -1); } public static void main(String[] args) { try { // File file1 = new File("/Users/rana/academia.edu_01.html"); // File file2 = new File("/Users/rana/academia.edu_02.html"); File file1 = new File(args[0]); File file2 = new File(args[1]); byte data1[] = new byte[(int) file1.length()]; byte data2[] = new byte[(int) file2.length()]; FileInputStream stream1 = new FileInputStream(file1); FileInputStream stream2 = new FileInputStream(file2); stream1.read(data1); stream2.read(data2); String string1 = new String(data1); String string2 = new String(data2); long timeStart = System.currentTimeMillis(); long simhash1 = computeSimHashFromString(Shingle.shingles(string1)); long timeEnd = System.currentTimeMillis(); System.out.println("Old Calc for Document A Took:" + (timeEnd - timeStart)); timeStart = System.currentTimeMillis(); long simhash2 = computeSimHashFromString(Shingle.shingles(string2)); timeEnd = System.currentTimeMillis(); System.out.println("Old Calc for Document B Took:" + (timeEnd - timeStart)); timeStart = System.currentTimeMillis(); long simhash3 = computeOptimizedSimHashForString(string1); timeEnd = System.currentTimeMillis(); System.out.println("New Calc for Document A Took:" + (timeEnd - timeStart)); timeStart = System.currentTimeMillis(); long simhash4 = computeOptimizedSimHashForString(string2); timeEnd = System.currentTimeMillis(); System.out.println("New Calc for Document B Took:" + (timeEnd - timeStart)); int hammingDistance = hammingDistance(simhash1, simhash2); int hammingDistance2 = hammingDistance(simhash3, simhash4); System.out.println("hammingdistance Doc (A) to Doc(B) OldWay:" + hammingDistance); System.out.println("hammingdistance Doc (A) to Doc(B) NewWay:" + hammingDistance2); } catch (IOException e) { e.printStackTrace(); } } }
以上就是小编今天的分享了,希望可以帮助到大家。