To set up the Python environment, refer to the Runoob tutorial:
Link: https://www.runoob.com/w3cnote/python-pip-install-usage.html
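The two scripts below rely on the third-party packages requests, lxml, and xlwt. Assuming pip is already available on your PATH, they can typically be installed with:

pip install requests lxml xlwt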
1. Scrape web page data and print it
import requests
from lxml import etree

# Fetch the page source
html = requests.get("https://www.ghpym.com/category/videos")
# Print the raw source if you want to inspect it
# print(html.text)
etree_html = etree.HTML(html.text)  # parse the source into a form that XPath can match against

# XPath copied from the browser for a single item:
# //*[@id="wrap"]/div/div/div/ul/li[1]/div[2]/h2/a/text()
content = etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')
for each in content:
    replace = each.replace('\n', '').replace(' ', '')  # strip newlines and spaces
    if replace == '\n' or replace == "":
        continue
    else:
        print(replace)

content = etree_html.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/text()')
for each in content:
    replace = each.replace('\n', '').replace(' ', '')
    if replace == '\n' or replace == "":
        continue
    else:
        print(replace)
print("Done")
2. Write the scraped data into an .xls spreadsheet
# coding:utf-8
from lxml import etree
import requests
import xlwt

title = []

def get_film_name(url):
    html = requests.get(url).text  # it usually helps to print html first to confirm the page returned content
    # print(html)
    s = etree.HTML(html)  # parse the source into a form that XPath can match against
    filename = s.xpath('//*[@id="wrap"]/div/div/div/ul/li/div[2]/h2/a/@href')  # returns a list
    print(filename)
    title.extend(filename)

def get_all_film_name():
    for i in range(0, 250, 25):
        # Note: this URL contains no '{}' placeholder, so .format(i) returns the same page on
        # every iteration; substitute the site's real pagination pattern to crawl more pages.
        url = 'https://www.ghpym.com/category/videos'.format(i)
        get_film_name(url)

if __name__ == '__main__':
    myxls = xlwt.Workbook()
    sheet1 = myxls.add_sheet(u'top250', cell_overwrite_ok=True)
    get_all_film_name()
    for i in range(0, len(title)):
        sheet1.write(i, 0, i + 1)     # column 0: running index starting at 1
        sheet1.write(i, 1, title[i])  # column 1: scraped link
    myxls.save('top250.xls')
    print("Done")
Simple crawler operations: 1. scrape web page data and print it; 2. write the scraped data into an .xls spreadsheet.
Original post: https://www.cnblogs.com/jessezs/p/12584505.html