从爬取的文章 HTML 中提取出中文关键字
分 2 步。第 1 步:从 HTML 中提取出纯文本(去掉标签):
import org.htmlparser.NodeFilter; import org.htmlparser.Parser; import org.htmlparser.beans.StringBean; import org.htmlparser.filters.CssSelectorNodeFilter; import org.htmlparser.util.NodeList; public class HtmlUtil { public static String getText(String html, String id) { try { Parser parser = new Parser(html); NodeFilter filter = new CssSelectorNodeFilter("#" + id); NodeList nList = parser.extractAllNodesThatMatch(filter); return nList == null || nList.size() == 0 ? ...