gistfile1.txt /** * 从一段HTML中萃取纯文本 * @param html * @return */public static String getPlainText(String html){ if(StringUtils.isBlank(html)) return ""; Element ebody = Jsoup.parseBodyFragment(html).body(); ebody.select("code"
/**
 * Extracts the plain text from a piece of HTML.
 *
 * <p>The input is parsed as a body fragment; every {@code <code>}, {@code <pre>}
 * and {@code <img>} element is removed from the parsed body before its text is read.
 *
 * @param html the HTML markup to extract text from
 * @return the text of the parsed body with the elements above removed, or an empty
 *         string when {@code html} is blank according to {@code StringUtils.isBlank}
 */
public static String getPlainText(String html) {
    // Blank input has nothing to extract; skip parsing altogether.
    if (StringUtils.isBlank(html)) {
        return "";
    }

    Element body = Jsoup.parseBodyFragment(html).body();

    // Remove these elements (in this order) so their content does not end up in the text.
    for (String tag : new String[] {"code", "pre", "img"}) {
        body.select(tag).remove();
    }

    return body.text();
}
/**
 * Computes the similarity of two pieces of HTML content.
 *
 * <p>Each input is first reduced to plain text via {@link #getPlainText(String)},
 * then tokenized and wrapped in a {@code Content}; the two {@code Content} objects
 * are handed to {@code ContentComparator.compareStatic}.
 *
 * @param html1 the HTML of the first piece of content
 * @param html2 the HTML of the second piece of content
 * @return the value returned by {@code ContentComparator.compareStatic} for the two contents
 * @throws IOException declared by this method; presumably raised while tokenizing
 *         (the throwing call is not visible here — confirm against {@code tokenizer})
 */
public static double similarity(String html1, String html2) throws IOException {
    // Reduce both inputs to plain text before any tokenizing takes place.
    final String plain1 = getPlainText(html1);
    final String plain2 = getPlainText(html2);

    return ContentComparator.compareStatic(
            new Content(tokenizer(plain1)),
            new Content(tokenizer(plain2)));
}
