gistfile1.txt /** * 从一段HTML中萃取纯文本 * @param html * @return */public static String getPlainText(String html){ if(StringUtils.isBlank(html)) return ""; Element ebody = Jsoup.parseBodyFragment(html).body(); ebody.select("code"
/**
 * Extracts the plain text from a fragment of HTML.
 *
 * <p>The input is parsed as a body fragment, and every {@code code}, {@code pre}
 * and {@code img} element is removed from the parsed body before its text is
 * collected.
 *
 * @param html the HTML fragment to reduce to text
 * @return the text of the parsed body, or an empty string when {@code html} is blank
 */
public static String getPlainText(String html) {
    if (!StringUtils.isBlank(html)) {
        Element body = Jsoup.parseBodyFragment(html).body();
        // Strip these elements, in this order, before reading the text.
        for (String tag : new String[] {"code", "pre", "img"}) {
            body.select(tag).remove();
        }
        return body.text();
    }
    return "";
}

/**
 * Computes the similarity between two pieces of HTML content.
 *
 * <p>Both inputs are first reduced to plain text with {@link #getPlainText(String)};
 * each text is then tokenized, wrapped in a {@code Content} object, and the pair is
 * handed to {@code ContentComparator.compareStatic}.
 *
 * @param html1 HTML of the first content
 * @param html2 HTML of the second content
 * @return the value returned by {@code ContentComparator.compareStatic} for the two contents
 * @throws IOException declared by this method; presumably raised while tokenizing —
 *         TODO confirm against {@code tokenizer}
 */
public static double similarity(String html1, String html2) throws IOException {
    // Both plain texts are extracted before any tokenizing takes place.
    final String plainFirst = getPlainText(html1);
    final String plainSecond = getPlainText(html2);

    final Content first = new Content(tokenizer(plainFirst));
    final Content second = new Content(tokenizer(plainSecond));
    return ContentComparator.compareStatic(first, second);
}