# XPath is a language for searching content inside XML documents.
# The original notes treat HTML as a subset of XML, so the same query syntax
# is used on HTML pages as well.
from lxml import etree

xml = """
<book>
<id>1</id>
<name>野花遍地⾹</name>
<price>1.23</price>
<nick>臭⾖腐</nick>
<author>
<nick id="10086">周⼤强</nick>
<nick id="10010">周芷若</nick>
<nick class="joy">周杰伦</nick>
<nick class="jolin">蔡依林</nick>
<div>
<nick>惹了</nick>
</div>
</author>
<partner>
<nick id="ppc">胖胖陈</nick>
<nick id="ppbc">胖胖不陈</nick>
</partner>
</book>
"""

# Build the element tree from the in-memory string. Per the original notes,
# etree.parse is the counterpart used for loading a file.
tree = etree.XML(xml)

# Path syntax used below:
#   "/"  separates levels; the leading "/" starts at the root node.
#   "//" matches descendants at any depth.
#   "*"  is a wildcard that matches any node name.
# Other queries tried against this document:
#   tree.xpath("/book/name")
#   tree.xpath("/book/name/text()")
nick_text_query = "/book/*//nick/text()"
result = tree.xpath(nick_text_query)
print(result)
# XPath is a language for searching content inside XML documents.
# The original notes treat HTML as a subset of XML.
# NOTE(review): this is the same sample document as in the first example; in
# the code visible here its only use is the commented-out etree.XML(xml) call
# further down.
xml = """
<book>
<id>1</id>
<name>野花遍地⾹</name>
<price>1.23</price>
<nick>臭⾖腐</nick>
<author>
<nick id="10086">周⼤强</nick>
<nick id="10010">周芷若</nick>
<nick class="joy">周杰伦</nick>
<nick class="jolin">蔡依林</nick>
<div>
<nick>惹了</nick>
</div>
</author>
<partner>
<nick id="ppc">胖胖陈</nick>
<nick id="ppbc">胖胖不陈</nick>
</partner>
</book>
"""
from lxml import etree
# In-memory example from above, left disabled:
# tree=etree.XML(xml)
# #result=tree.xpath("/book/name")  # "/" separates levels; the leading "/" is the root node
# #result=tree.xpath("/book/name/text()")
# result=tree.xpath("/book/*//nick/text()")  # "//" matches descendants; "*" is a wildcard for any node
# # parse loads a file
# print(result)
# Parse the HTML file "b.html" using lxml's HTML parser.
tree = etree.parse("b.html", etree.HTMLParser())

# An attribute predicate can also be used to pick a single link, e.g.:
#   tree.xpath("/html/body/ol/li/a[@href='dapao']/text()")

# Select every <li> under /html/body/ul, then run a relative query ("./...")
# on each of them. The loop body is indented so the file actually parses, and
# the loop variable is no longer the ambiguous single letter "l".
li_nodes = tree.xpath("/html/body/ul/li")
for li in li_nodes:
    # "./a/@href" yields the href attribute values of this <li>'s <a> children;
    # "./a/text()" would yield their text instead.
    res = li.xpath("./a/@href")
    print(res)

# Text nodes that are direct children of the first <div> under <body>.
res1 = tree.xpath("/html/body/div[1]/text()")
print(res1)
# 以上是 XPath 的语法。目前来说,在三种爬虫方式中,XPath 应该是最简单的,正则是最万能的;XPath 的主要优势在于:对于界面元素,可以直接复制其 XPath 路径来获取,不需要自己分析源代码。