
Python crawler demo

Posted: 2019-12-30 01:18:35
# coding: utf-8

import datetime
import urllib.parse
import urllib.request
from urllib.error import HTTPError, URLError
from bs4 import BeautifulSoup
import re
import os


def get_html(url, values):
    """POST the query values to url and return (html, status_code), retrying up to twice."""
    html = ''
    status_code = 200
    user_agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36'
    headers = {'User-Agent': user_agent}
    data = urllib.parse.urlencode(values).encode(encoding='utf-8')
    for i in range(1, 3):
        req = urllib.request.Request(url=url, headers=headers, data=data)
        try:
            response = urllib.request.urlopen(req)
        except HTTPError as e:
            print(url, values)
            print("The server couldn't fulfill the request.")
            print('HTTP Error, code:', e.code)
            status_code = int(e.code)
            break
        except URLError as e:
            # URLError carries no HTTP status code, so fall back to 0 and retry
            status_code = getattr(e, 'code', 0)
            print('We failed to reach a server. Reason:', e.reason)
            print('url: %s, status code: %d, retry count: %d' % (url + '?' + bytes.decode(data), status_code, i))
        else:
            # the CICPA site serves GBK-encoded pages
            html = response.read().decode('gbk')
            break

    return html, status_code


def request_page(page):
    """Request one page of the public CICPA member list."""
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/PersonIndexAction.do'
    values = {
        'method': 'indexQuery',
        'queryType': 2,
        'isStock': '00',
        'pageSize': '',
        'pageNum': page,
        'offName': '',
        'ascGuid': '',
        'perCode': 0,
        'perName': ''
    }
    return get_html(url, values)


def parse_cicpa_page(html):
    soup = BeautifulSoup(html, 'html.parser')
    items = soup.select("#tabDetail a")
    return items


def request_detail(code):
    print('request code:', code)
    url = 'http://cmispub.cicpa.org.cn/cicpa2_web/07/' + code + '.shtml'
    values = {}
    return get_html(url, values)


def parse_detail_header(html):
    soup = BeautifulSoup(html, 'html.parser')
    headers = soup.select("#detailtb td.tdl")
    line = ''
    for item in headers:
        line = line + item.get_text().strip() + ','
    line = line.strip(',')
    return line


def parse_detail_content(html):
    soup = BeautifulSoup(html, 'html.parser')
    headers = soup.select("#detailtb td.data_tb_content")
    line = ''
    for item in headers:
        line = line + item.get_text().strip() + ','
    line = line.strip(',')
    return line


def create_file(filepath, header):
    file_dir = os.path.split(filepath)[0]
    if not os.path.isdir(file_dir):
        os.makedirs(file_dir)
    if not os.path.exists(filepath):
        f = open(filepath, 'w')
        if len(header) > 0:
            f.write(header + '\n')
        f.close( )


def is_down_exists(code):
    # placeholder, not used; main() checks for the saved html file directly instead
    return False


def main():
    start_time = datetime.datetime.now( )

    html_dir = 'D:/crawl_data/cicpa/html/'
    if not os.path.isdir(html_dir):
        os.makedirs(html_dir)

    header_file = 'D:/crawl_data/cicpa/header.csv'
    need_header = not os.path.exists(header_file)

    datafile = 'D:/crawl_data/cicpa/data_%s.csv' % start_time.strftime("%Y%m%d_%H%M%S_%f")
    page_error_file = 'D:/crawl_data/cicpa/error_page_%s.txt' % start_time.strftime("%Y%m%d_%H%M%S_%f")
    detail_error_file = 'D:/crawl_data/cicpa/error_detail_%s.txt' % start_time.strftime("%Y%m%d_%H%M%S_%f")

    create_file(datafile, '')
    create_file(page_error_file, 'page,status')
    create_file(detail_error_file, 'code,status')

    # open in append mode so the header rows written by create_file are preserved
    data_file_object = open(datafile, 'a')
    page_error_file_object = open(page_error_file, 'a')
    detail_error_file_object = open(detail_error_file, 'a')

    for i in range(1, 6912):
        print('request:', i)
        result, status = request_page(i)
        if status != 200:
            page_error_file_object.write(str(i) + ',' + str(status) + '\n')
            page_error_file_object.flush( )
            continue
        items = parse_cicpa_page(result)
        for item in items:
            code = re.findall(r"javascript:viewDetail\('(\w+?)',", str(item))[0]
            html_file_path = html_dir + code + '.html'
            if os.path.exists(html_file_path):
                continue
            detail_html, status = request_detail(code)
            if len(detail_html) == 0:
                detail_error_file_object.write(code + ',%d\n' % status)
                detail_error_file_object.flush( )
                continue
            if need_header:
                header = parse_detail_header(detail_html)
                f = open(header_file, 'w')
                f.write(header + '\n')
                f.close()
                need_header = False
            # save base data
            line = parse_detail_content(detail_html)
            data_file_object.write(line + '\n')
            data_file_object.flush( )
            # save html
            html_file_object = open(html_file_path, 'w')
            html_file_object.write(detail_html + '\n')
            html_file_object.close( )
            print(line)

    data_file_object.close( )
    page_error_file_object.close( )
    detail_error_file_object.close( )
    print('finished in', (datetime.datetime.now() - start_time).total_seconds(), 's')

if __name__ == '__main__':
    main()
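
Page requests that fail are only logged to error_page_*.txt as page,status rows, so a second pass is needed to pick them up. The sketch below is one minimal way to do that, assuming the script above is saved as cicpa_crawler.py (a hypothetical file name) so that request_page and parse_cicpa_page can be imported; the error file path is passed on the command line.

# retry_pages.py - hypothetical follow-up pass over an error_page_*.txt file
import csv
import sys

from cicpa_crawler import request_page, parse_cicpa_page  # assumes the script above is saved under this name


def retry_failed_pages(error_file):
    # the error file starts with a "page,status" header row written by create_file
    with open(error_file, newline='') as f:
        for row in csv.DictReader(f):
            page = int(row['page'])
            html, status = request_page(page)
            if status == 200:
                items = parse_cicpa_page(html)
                print('page %d recovered, %d detail links found' % (page, len(items)))
            else:
                print('page %d still failing, status %d' % (page, status))


if __name__ == '__main__':
    retry_failed_pages(sys.argv[1])  # pass the error_page_*.txt path as the first argument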

 


Original: https://www.cnblogs.com/zhaohz/p/12117167.html
