<li>
<a href="/hot/page/4/" rel="nofollow">
<!--<a href="/hot/page/4/" rel="nofollow">-->
<span class="next">
下一页
</span>
</a>
</li>
选取a标签的href属性:
# Find the <span class="next"> inside an <a>, step back up to that parent <a>,
# and read its href attribute.
response.xpath('//a/span[@class="next"]/parent::a/@href').extract()
python - How can I get the text with xPath between and
//div[@class='oc_info']/ul[@class='list']/following-sibling::text()
How to get html elements with multiple css classes - Stack Overflow
//div[contains(@class, 'class1') and contains(@class, 'class2')]
# Select every <li> of the article list, then query each one with paths
# relative to that <li> (no leading slash).
# NOTE(review): `selector` is not defined in this snippet; it is presumably a
# Scrapy Selector built from the page -- confirm against the surrounding code.
articles = selector.xpath('//ul[@class="article-list thumbnails"]/li')
for article in articles:
    title = article.xpath('div/h4/a/text()').extract()
    url = article.xpath('div/h4/a/@href').extract()
    author = article.xpath('div/p/a/text()').extract()
# Version 1
# NOTE(review): `response` and `item` are not defined in this snippet; they
# are presumably the Scrapy response and the item being filled -- confirm.
for r in response.xpath('//li[@class="clearfix"]'):
    # Scrape the title
    item['title'] = r.xpath('./h3/a/text()').extract()
    # Scrape the short description. XPath string() applied to a node-set
    # uses only the first node of that set.
    item['desc'] = r.xpath('string(./p/text())').extract()
    # Scrape the time
    item['time'] = r.xpath('./div/span/text()').extract()
# Version 2
from lxml import etree

# NOTE(review): `html` must already hold the page markup before this line;
# it is not defined in this snippet. The name is then rebound to the parsed tree.
html = etree.HTML(html)
li = html.xpath("//li[@class='clear']")
print(type(li))
print(len(li))
for item in li:
    # Wrapping the path in string() asks XPath for a string value instead of
    # a node-set.
    title = item.xpath("string(.//div[@class='title']/a/text())")
    print(title)
# Alternatively: a self-contained example that parses an inline HTML string.
from lxml import etree

# The last <li> is left unclosed in this sample; presumably this is deliberate,
# to show how the parser copes with imperfect markup -- confirm against the
# original article.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
tmp_html = etree.HTML(text)
# Serialize the parsed tree back to markup and show it.
result = etree.tostring(tmp_html)
print(result)
li = tmp_html.xpath("//li")
# A bare `len(li)` only displays a value in a REPL; print it so the count is
# also shown when this runs as a script.
print(len(li))
for i in li:
    # All text nodes found under any <a> below the current <li>.
    a = i.xpath(".//a//text()")
    print(a)
原文:https://www.cnblogs.com/ministep/p/14589531.html