爬虫就是沿着一定的路径,模拟人工的行为,自动、高效地浏览互联网操作,从网站、应用程序等终端呈现的平台上去提取所需要的数据。 jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。
爬虫就是沿着一定的路径,模拟人工的行为,自动、高效地浏览互联网操作,从网站、应用程序等终端呈现的平台上去提取所需要的数据。
jsoup是一款Java的HTML解析器,可直接解析某个URL地址、HTML文本内容。它提供了一套非常省力的API,可通过DOM,CSS以及类似于jQuery的操作方法来取出和操作数据。
通过Document选择器常用的获取Elements方法
select(String cssQuery) //查找与CSS选择器查询匹配的元素,类似于 jQuery 的选择器。
getElementById(String id) //通过ID查找元素
getElementsByClass(String className) //通过class查找元素
getElementsByTag(String tagName) //通过标签名查找元素(包含当前元素,并递归查找其所有后代元素)。
getElementsByAttributeStarting(String keyPrefix) //查找具有以提供的前缀开头的属性名称的元素
getElementsByAttributeValue(String key, String value) //查找具有特定值的属性的元素
getElementsByAttributeValueContaining(String key, String match) //查找具有其值包含匹配字符串的属性的元素
getElementsByAttributeValueStarting(String key, String valuePrefix) //查找具有以值前缀开头的属性的元素
getElementsByAttributeValueEnding(String key, String valueSuffix) //查找具有以值后缀结尾的属性的元素
getElementsContainingText(String searchText) //查找包含指定字符串的元素
getAllElements() //在此元素下找到所有元素。
firstElementSibling() //获取此元素的第一个同级元素
获得Elements元素的内容
attr(String key) //获得元素指定属性的值
text() //得到文本值
html() //获取html
outerHtml() //获得外部html(包含元素自身的标签)
导入jsoup的jar包
<dependency><groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.14.3</version>
</dependency>
写个爬取影片详情demo
/** Browser User-Agent strings sent with outgoing requests; getDocument uses the first entry. */
private static final String[] USER_AGENT = {"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36","Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"};
/**
 * Fetches a search-result page and scrapes the detail page of every hit on it.
 *
 * <p>Each {@code div.title} inside {@code div.result-list} is treated as one hit: the
 * {@code href} of its link is handed to {@link #getVedioInfo(String)}, and the inner HTML of
 * its first {@code span}, with square brackets removed, is stored as the movie type.
 *
 * @param url address of the search-result page (the full URL, not the movie name)
 * @return the movies that could be scraped, possibly empty; {@code null} when {@code url} is
 *     empty or the page could not be fetched
 */
public List<MovieInfo> getSearchInfoList(String url) {
    if (StringUtils.isEmpty(url)) {
        return null;
    }
    // Fetch the search-result page.
    Document document = getDocument(url);
    if (document == null) {
        return null;
    }
    List<MovieInfo> resultList = new ArrayList<>();
    // Walk the result entries.
    Elements aEls = document.select("div.result-list").select("div.title");
    for (Element el : aEls) {
        String href = el.select("a").attr("href");
        MovieInfo vedioInfo = getVedioInfo(href);
        if (vedioInfo == null) {
            continue;
        }
        // A hit without any span used to throw IndexOutOfBoundsException on get(0) and abort
        // the whole search; now the movie is kept and its type is simply left unset.
        Elements typeSpans = el.select("span");
        if (!typeSpans.isEmpty()) {
            vedioInfo.setType(typeSpans.get(0).html().replaceAll("\\[|\\]", ""));
        }
        resultList.add(vedioInfo);
    }
    return resultList;
}
/**
 * Scrapes one movie detail page into a {@link MovieInfo}.
 *
 * <p>Every scraped field is optional: when the corresponding element is missing from the
 * page, or its text cannot be parsed as a number, that field is left unset instead of
 * failing the whole page.
 *
 * @param url address of the detail page
 * @return the populated movie, or {@code null} when {@code url} is empty or the page could
 *     not be fetched
 */
public MovieInfo getVedioInfo(String url) {
    if (StringUtils.isEmpty(url)) {
        return null;
    }
    Document doc = getDocument(url);
    if (doc == null) {
        // getDocument returns null on a failed fetch; this used to fall through to an NPE.
        return null;
    }
    url = doc.baseUri();
    MovieInfo movieInfo = new MovieInfo();
    movieInfo.setPageUrl(url);
    // Extract the id from the address: the first run of digits that does not start with 0.
    String moveId = getPattern("([1-9]\\d*)", url, 0);
    if (StringUtils.isNotEmpty(moveId)) {
        movieInfo.setMoveId(moveId);
    }
    // Title and year: guard against pages where the element is absent (get(0) would throw).
    Elements titleEls = doc.select("span[property=\"v:itemreviewed\"]");
    if (!titleEls.isEmpty()) {
        movieInfo.setTitle(titleEls.get(0).html());
    }
    Elements yearEls = doc.select("span[class=\"year\"]");
    if (!yearEls.isEmpty()) {
        movieInfo.setYear(yearEls.get(0).html().replaceAll("\\(|\\)", ""));
    }
    Elements subject = doc.select("div#info");
    // Directors
    String directName = joinInnerHtml(subject.select("a[rel=\"v:directedBy\"]"));
    if (StringUtils.isNotEmpty(directName)) {
        movieInfo.setDirector(String.format(";%s;", directName.trim()));
    }
    Elements plElement = subject.select("span[class=\"pl\"]");
    // Screenwriters
    String adaptorName = joinLinksAfterLabel(plElement, "编剧", "a");
    if (StringUtils.isNotEmpty(adaptorName)) {
        movieInfo.setAdaptorName(String.format(";%s;", adaptorName.trim()));
    }
    // Leading actors
    String leaderName = joinLinksAfterLabel(plElement, "主演", "a[rel=\"v:starring\"]");
    if (StringUtils.isNotEmpty(leaderName)) {
        movieInfo.setActors(String.format(";%s;", leaderName.trim()));
    }
    // Genre tags
    String kindName = joinInnerHtml(subject.select("span[property=\"v:genre\"]"));
    if (StringUtils.isNotEmpty(kindName)) {
        movieInfo.setTags(String.format(";%s;", kindName.trim()));
    }
    // Country / region of production
    String areaName = rawTextAfterLabel(plElement, "制片国家/地区:");
    if (StringUtils.isNotEmpty(areaName)) {
        movieInfo.setAreaChar(String.format(";%s;", areaName.trim()));
    }
    // Language
    String languageName = rawTextAfterLabel(plElement, "语言:");
    if (StringUtils.isNotEmpty(languageName)) {
        movieInfo.setLanguage(String.format(";%s;", languageName.trim()));
    }
    // Release date
    String releaseStr = subject.select("span[property=\"v:initialReleaseDate\"]").html();
    if (StringUtils.isNotEmpty(releaseStr)) {
        // Matches the first yyyy-MM-dd date with per-month day limits (February only up to 28).
        String patten = "([0-9]{3}[1-9]|[0-9]{2}[1-9][0-9]{1}|[0-9]{1}[1-9][0-9]{2}|[1-9][0-9]{3})-(((0[13578]|1[02])-(0[1-9]|[12][0-9]|3[01]))|((0[469]|11)-(0[1-9]|[12][0-9]|30))|(02-(0[1-9]|[1][0-9]|2[0-8])))";
        String s = getPattern(patten, releaseStr, 0);
        if (StringUtils.isNotEmpty(s)) {
            // NOTE(review): this overwrites the year set above with the full release date;
            // confirm whether MovieInfo should carry the date in a separate field instead.
            movieInfo.setYear(s);
        }
    }
    // Number of episodes
    String totalNumber = rawTextAfterLabel(plElement, "集数:");
    if (StringUtils.isNotEmpty(totalNumber)) {
        try {
            movieInfo.setTotalNumber(Integer.parseInt(totalNumber.trim()));
        } catch (NumberFormatException ignored) {
            // Scraped text is not a plain integer; leave the episode count unset rather than
            // losing every other field of this movie.
        }
    }
    // Alternative titles, separated by "/" on the page
    String otherName = rawTextAfterLabel(plElement, "又名:");
    if (StringUtils.isNotEmpty(otherName)) {
        // Trim before de-duplicating so that " A" and "A" count as the same alias.
        movieInfo.setOtherName(Arrays.stream(otherName.split("/"))
                .map(String::trim)
                .distinct()
                .collect(Collectors.joining(";")));
    }
    // Rating
    String scoreStr = doc.select("div#interest_sectl").select("strong[property=\"v:average\"]").html();
    if (StringUtils.isNotEmpty(scoreStr)) {
        try {
            movieInfo.setScores(new BigDecimal(scoreStr.trim()));
        } catch (NumberFormatException ignored) {
            // Non-numeric rating text; leave the score unset.
        }
    }
    // Synopsis
    String story = doc.select("span[property=\"v:summary\"]").html();
    if (StringUtils.isNotEmpty(story)) {
        movieInfo.setStory(story.trim());
    }
    // Poster image
    String imgSrc = doc.select("div#mainpic").select("img").attr("src");
    movieInfo.setPoster(imgSrc);
    return movieInfo;
}

/** Joins the trimmed inner HTML of every element with ";"; empty string for no elements. */
private static String joinInnerHtml(Elements elements) {
    return elements.stream().map(o -> o.html().trim()).collect(Collectors.joining(";"));
}

/**
 * Finds the first label whose inner HTML equals {@code label}, selects {@code cssQuery} inside
 * its next element sibling and joins the matches with ";". Returns an empty string when the
 * label or its sibling is missing.
 */
private static String joinLinksAfterLabel(Elements labels, String label, String cssQuery) {
    return labels.stream()
            .filter(o -> o.html().equals(label))
            .findFirst()
            .map(Element::nextElementSibling) // a null sibling becomes an empty Optional
            .map(sibling -> joinInnerHtml(sibling.select(cssQuery)))
            .orElse("");
}

/**
 * Returns the outer HTML of the node that directly follows the first label whose inner HTML
 * equals {@code label}, untrimmed; empty string when the label or the following node is missing.
 */
private static String rawTextAfterLabel(Elements labels, String label) {
    return labels.stream()
            .filter(o -> o.html().equals(label))
            .findFirst()
            .map(Element::nextSibling) // a null sibling becomes an empty Optional
            .map(node -> node.outerHtml())
            .orElse("");
}
/**
 * Downloads {@code url} with an HTTP GET and parses the response into a jsoup document.
 *
 * <p>An {@link IOException} is reported on the console instead of being thrown, so callers
 * must check the result for {@code null}.
 *
 * @param url address to fetch
 * @return the parsed document, or {@code null} when the request failed with an
 *     {@link IOException}
 */
private Document getDocument(String url) {
    try {
        return Jsoup.connect(url)
                // Further options that can be chained here when a site needs them:
                //   .proxy(host, port)   .data(key, value)   .cookie(name, value)   .post()
                .userAgent(USER_AGENT[0]) // User-Agent header
                .timeout(15000) // request timeout in milliseconds
                // By default jsoup fails with an IOException on content types other than
                // text/* and XML; getInfoListJson fetches a JSON endpoint through this
                // method, so the content-type check has to be switched off.
                .ignoreContentType(true)
                .get();
    } catch (IOException e) {
        e.printStackTrace();
        System.out.println("爬取失败,url=" + url);
        return null;
    }
}
/**
 * Returns one capture group of the first match of {@code regex} in {@code content}.
 *
 * @param regex regular expression to search with
 * @param content text to search; may be {@code null}
 * @param groupIndex capture group to return, {@code 0} for the whole match
 * @return the requested group of the first match, or {@code null} when {@code content} is
 *     {@code null} or nothing matches
 * @throws IndexOutOfBoundsException if a match is found but the pattern has no group
 *     {@code groupIndex}
 */
public String getPattern(String regex, String content, int groupIndex) {
    if (content == null) {
        // Nothing to search: report it like a non-match instead of failing inside matcher().
        return null;
    }
    Matcher matcher = Pattern.compile(regex).matcher(content);
    return matcher.find() ? matcher.group(groupIndex) : null;
}
/**
 * Fetches one page of the JSON listing endpoint and maps every entry of {@code data.list}
 * to a {@link MovieInfo}.
 *
 * @param page page number substituted into the {@code page_id} query parameter
 * @return the mapped movies, possibly empty; {@code null} when the page could not be fetched
 */
public List<MovieInfo> getInfoListJson(Integer page) {
    String url = "https:xxx/list?channel_id=1&page_id=%s&ret_num=48";
    String searchUrl = String.format(url, page);
    // Fetch the response; its body text is parsed as JSON below.
    Document document = getDocument(searchUrl);
    if (document == null) {
        return null;
    }
    List<MovieInfo> resultList = new ArrayList<>();
    // Plain loop instead of forEach: the list is built directly rather than via a side effect.
    for (Object item : net.sf.json.JSONObject.fromObject(document.text())
            .getJSONObject("data")
            .getJSONArray("list")) {
        net.sf.json.JSONObject json = net.sf.json.JSONObject.fromObject(item);
        MovieInfo movieInfo = new MovieInfo();
        movieInfo.setMoveId(json.getString("tvId"));
        movieInfo.setTitle(json.getString("name"));
        // Guard the score the same way getVedioInfo does, and additionally tolerate
        // non-numeric text so one odd entry cannot abort the whole page.
        String score = json.getString("score");
        if (StringUtils.isNotEmpty(score)) {
            try {
                movieInfo.setScores(new BigDecimal(score));
            } catch (NumberFormatException ignored) {
                // Non-numeric score; leave it unset.
            }
        }
        movieInfo.setYear(json.getString("period"));
        movieInfo.setPoster(json.getString("imageUrl"));
        // NOTE(review): "description" is stored in the director field — confirm this mapping.
        movieInfo.setDirector(json.getString("description"));
        resultList.add(movieInfo);
    }
    return resultList;
}
/**
 * Demo entry point: scrapes a search-result page for one title, then one page of the JSON
 * listing, and prints both results.
 *
 * <p>NOTE(review): getSearchInfoList and getInfoListJson are declared as instance methods, so
 * these calls need an instance of the enclosing class (its name is not visible in this
 * snippet) or the methods must be made static.
 */
public static void main(String[] args) {
    String search = "https://www.xxx.com/search?cat=1002&q=%s";
    String url = String.format(search, "明日战记");
    List<MovieInfo> searchResults = getSearchInfoList(url);
    System.out.println(searchResults);
    // Separate variable: redeclaring "list" in the same scope does not compile.
    List<MovieInfo> jsonResults = getInfoListJson(1);
    System.out.println(jsonResults);
}