首页 > 编程语言 > 详细

Python 爬虫

时间:2020-10-25 22:42:36      阅读:27      评论:0      收藏:0      [点我收藏+]

1. 配置第三方包

# 时间的模块
import datetime
# 数据分析模块,用来处理excel
import pandas as pd
#用来构造xlsx文件的模块
import xlsxwriter as xlw
# 用来爬取数据的模块
from urllib import request
# HTML或XML标签中的内容解析器
from bs4 import BeautifulSoup as bs

2.获取时间序列函数

# 产生时间序列
def dateRange1(start, end):
    """Return the distinct months covered by a date range as 'YYYYMM' strings.

    Args:
        start: First day of the range; anything ``pandas.date_range`` accepts
            as ``start`` (e.g. '2020-09' or '2020-09-15').
        end: Last day of the range (inclusive), same accepted forms.

    Returns:
        A list of 'YYYYMM' strings in ascending order, one per calendar month
        that contains at least one day of the range. Empty when ``end`` is
        earlier than ``start``.
    """
    # date_range yields one timestamp per day; the set collapses them to months.
    months = {day.strftime('%Y%m')
              for day in pd.date_range(start=start, end=end)}
    # Fixed-width 'YYYYMM' strings sort chronologically.
    return sorted(months)
# ['202005', '202006', '202007', '202008', '202009', '202010']

3.爬取网页

# 爬取网页数据,解析HTML文件,筛选数据,转换成列表格式数据
def getCommentsById(city, start, end):
    """Scrape weather-history rows for ``city`` from lishi.tianqi.com.

    One page is requested per month returned by ``dateRange1(start, end)``.
    Every ``<li>`` matched by the CSS selector ``div .thrui > li`` becomes one
    row, built from the text of the first five ``<div>`` elements inside it.

    Args:
        city: City name as used in the site's URL path (pinyin, e.g. 'hunan').
        start: Start of the date range, passed through to ``dateRange1``.
        end: End of the date range, passed through to ``dateRange1``.

    Returns:
        A list of rows; each row is a list of exactly five ``str`` values.
        A cell whose ``.string`` is ``None`` (or that is missing from the
        page) is stored as ``''`` so the row can be written to Excel.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a page cannot
            be fetched.
    """
    fields_per_row = 5
    weather_result = []
    # One page per month, e.g. ['202009', '202010'].
    for month in dateRange1(start, end):
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        page_request = request.Request(url)
        # Send a browser-like User-Agent header (presumably the site rejects
        # urllib's default one — not verified here).
        page_request.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(page_request) as response:
            html = response.read()
        soup = bs(html, 'html.parser')

        for item in soup.select('div .thrui > li '):
            # Look the divs up once; slicing tolerates rows with fewer cells.
            cells = item.find_all('div')[:fields_per_row]
            # str() detaches the text from the parse tree; None becomes ''.
            row = ['' if cell.string is None else str(cell.string)
                   for cell in cells]
            # Pad short rows so every row has the same number of columns.
            row.extend([''] * (fields_per_row - len(row)))
            weather_result.append(row)
            # Progress output: print just the new row rather than the whole
            # accumulated list.
            print(row)
    return weather_result

4.输出excel文件

#  将list数据写入到本地excel中
def list_to_excel(weather_result, filename):
    """Write weather rows to ``<filename>.xlsx`` in the root of the E: drive.

    The sheet is named 'weather_report'. Row 0 holds the bold header
    (date, highest temperature, lowest temperature, weather, wind direction);
    data rows follow from row 1.

    Args:
        weather_result: Iterable of rows; each row is an iterable of cell
            values in header order. ``None`` cells are written as empty
            strings, other values are written as their ``str()`` form.
        filename: Workbook file name without the ``.xlsx`` extension.
    """
    # NOTE: the output location is hard-coded to the root of the E: drive.
    # The context manager closes (and thereby saves) the workbook even if a
    # write fails part-way through.
    with xlw.Workbook('E:\\%s.xlsx' % filename) as workbook:
        sheet = workbook.add_worksheet('weather_report')
        title = ['日期', '最高气温', '最低气温', '天气', '风向']
        # Create the bold format once instead of once per header cell.
        bold = workbook.add_format({'bold': True})
        for col, heading in enumerate(title):
            sheet.write_string(0, col, heading, bold)
        for row, record in enumerate(weather_result, start=1):
            # Each value goes to its own column, so the fifth value lands in
            # the wind-direction column rather than repeating the fourth.
            for col, value in enumerate(record):
                sheet.write_string(
                    row, col, '' if value is None else str(value))

5.调用

# City to query (pinyin name used in the site URL), start date, end date.
data = getCommentsById('hunan', '2020-09', '2020-10')

# The scraped rows and the Excel file name (without the .xlsx extension).
list_to_excel(data, '湖南天气202009-202010')

 

全部源码

# 时间的模块
import datetime
# 数据分析模块,用来处理excel
import pandas as pd
#用来构造xlsx文件的模块
import xlsxwriter as xlw
# 用来爬取数据的模块
from urllib import request
# HTML或XML标签中的内容解析器
from bs4 import BeautifulSoup as bs

# 产生时间序列
def dateRange1(start, end):
    """Return the distinct months covered by a date range as 'YYYYMM' strings.

    Args:
        start: First day of the range; anything ``pandas.date_range`` accepts
            as ``start`` (e.g. '2020-09' or '2020-09-15').
        end: Last day of the range (inclusive), same accepted forms.

    Returns:
        A list of 'YYYYMM' strings in ascending order, one per calendar month
        that contains at least one day of the range. Empty when ``end`` is
        earlier than ``start``.
    """
    # date_range yields one timestamp per day; the set collapses them to months.
    months = {day.strftime('%Y%m')
              for day in pd.date_range(start=start, end=end)}
    # Fixed-width 'YYYYMM' strings sort chronologically.
    return sorted(months)
# ['202005', '202006', '202007', '202008', '202009', '202010']


# 爬取网页数据,解析HTML文件,筛选数据,转换成列表格式数据
def getCommentsById(city, start, end):
    """Scrape weather-history rows for ``city`` from lishi.tianqi.com.

    One page is requested per month returned by ``dateRange1(start, end)``.
    Every ``<li>`` matched by the CSS selector ``div .thrui > li`` becomes one
    row, built from the text of the first five ``<div>`` elements inside it.

    Args:
        city: City name as used in the site's URL path (pinyin, e.g. 'hunan').
        start: Start of the date range, passed through to ``dateRange1``.
        end: End of the date range, passed through to ``dateRange1``.

    Returns:
        A list of rows; each row is a list of exactly five ``str`` values.
        A cell whose ``.string`` is ``None`` (or that is missing from the
        page) is stored as ``''`` so the row can be written to Excel.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when a page cannot
            be fetched.
    """
    fields_per_row = 5
    weather_result = []
    # One page per month, e.g. ['202009', '202010'].
    for month in dateRange1(start, end):
        url = 'http://lishi.tianqi.com/' + city + '/' + month + '.html'
        page_request = request.Request(url)
        # Send a browser-like User-Agent header (presumably the site rejects
        # urllib's default one — not verified here).
        page_request.add_header(
            'User-Agent', 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)')
        # The context manager closes the HTTP response once it has been read.
        with request.urlopen(page_request) as response:
            html = response.read()
        soup = bs(html, 'html.parser')

        for item in soup.select('div .thrui > li '):
            # Look the divs up once; slicing tolerates rows with fewer cells.
            cells = item.find_all('div')[:fields_per_row]
            # str() detaches the text from the parse tree; None becomes ''.
            row = ['' if cell.string is None else str(cell.string)
                   for cell in cells]
            # Pad short rows so every row has the same number of columns.
            row.extend([''] * (fields_per_row - len(row)))
            weather_result.append(row)
            # Progress output: print just the new row rather than the whole
            # accumulated list.
            print(row)
    return weather_result

#  将list数据写入到本地excel中
def list_to_excel(weather_result, filename):
    """Write weather rows to ``<filename>.xlsx`` in the root of the E: drive.

    The sheet is named 'weather_report'. Row 0 holds the bold header
    (date, highest temperature, lowest temperature, weather, wind direction);
    data rows follow from row 1.

    Args:
        weather_result: Iterable of rows; each row is an iterable of cell
            values in header order. ``None`` cells are written as empty
            strings, other values are written as their ``str()`` form.
        filename: Workbook file name without the ``.xlsx`` extension.
    """
    # NOTE: the output location is hard-coded to the root of the E: drive.
    # The context manager closes (and thereby saves) the workbook even if a
    # write fails part-way through.
    with xlw.Workbook('E:\\%s.xlsx' % filename) as workbook:
        sheet = workbook.add_worksheet('weather_report')
        title = ['日期', '最高气温', '最低气温', '天气', '风向']
        # Create the bold format once instead of once per header cell.
        bold = workbook.add_format({'bold': True})
        for col, heading in enumerate(title):
            sheet.write_string(0, col, heading, bold)
        for row, record in enumerate(weather_result, start=1):
            # Each value goes to its own column, so the fifth value lands in
            # the wind-direction column rather than repeating the fourth.
            for col, value in enumerate(record):
                sheet.write_string(
                    row, col, '' if value is None else str(value))




# City to query (pinyin name used in the site URL), start date, end date.
data = getCommentsById('hunan', '2020-09', '2020-10')

# The scraped rows and the Excel file name (without the .xlsx extension).
list_to_excel(data, '湖南天气202009-202010')

 

Python 爬虫

原文:https://www.cnblogs.com/Rivend/p/13875439.html

(0)
(0)
   
举报
评论 一句话评论(0
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!