
Scrape ten pages of Baidu "News" results for five companies (e.g., Alibaba, JD.com, Amazon, Huawei, Kweichow Moutai)


Store the data in MySQL with the following fields: company name, news title, URL, news source, and publication time.
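For reference, each scraped record is a flat dict whose keys double as the table's column names, so the INSERT statement can be generated from the dict itself. The values below are placeholders for illustration only, not real scraped output:

record = {
    'company': '阿里巴巴',                   # company the query was issued for
    'title': 'Example headline',             # placeholder news title
    'link': 'https://example.com/news/1',    # placeholder result URL
    'source': 'Example Source',              # publisher shown by Baidu News
    'time': '2019-10-22 22:00'               # timestamp string as displayed
}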

import time
import pymysql
import requests
from bs4 import BeautifulSoup
from requests import RequestException
     
     
def get_one_page(url):
    """Request one page of Baidu News results; return the HTML text, or None on failure."""
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
        }
        response = requests.get(url, headers=headers)
        # response.encoding = response.apparent_encoding
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None

     
def parse_one_page(c, text):
    """Parse one result page and yield one record per news item."""
    soup = BeautifulSoup(text, 'lxml')
    titles = soup.select('.c-title > a')
    sources = soup.find_all(name='p', class_='c-author')
    companys = ['阿里巴巴', '京东', '亚马逊', '华为', '贵州茅台']
    for i in range(10):
        data = {
            'company': companys[c],
            'title': titles[i].get_text().strip(),
            'link': titles[i]['href'],
            # The p.c-author text is "source\xa0\xa0time"; split on the
            # non-breaking space to separate the two fields.
            'source': sources[i].get_text().strip().split('\xa0')[0].strip(),
            'time': sources[i].get_text().strip().split('\xa0')[2].strip()
        }
        yield data
        
def create_sql():
    """Create the baidu table in the spiders database (run once)."""
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    cursor = db.cursor()
    sql = ("CREATE TABLE baidu (company VARCHAR(255) NOT NULL, title VARCHAR(255) NOT NULL, "
           "link VARCHAR(255) NOT NULL, source VARCHAR(255) NOT NULL, time VARCHAR(255) NOT NULL)")
    cursor.execute(sql)
    db.close()
    
def write_to_sql(data):
    """Insert one record; column names and %s placeholders are built from the dict keys."""
    table = 'baidu'
    keys = ', '.join(data.keys())
    values = ', '.join(['%s'] * len(data))
    sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(table=table, keys=keys, values=values)
    db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
    cursor = db.cursor()
    try:
        if cursor.execute(sql, tuple(data.values())):
            print('Successful')
            db.commit()
    except Exception:
        print('Failed')
        db.rollback()
    db.close()
    
    
def main(c, url):
    # Baidu News paginates with pn=0, 10, ..., 90: ten pages of ten results each.
    for pn in range(0, 91, 10):
        link = url + '&x_bfe_rqs=03E80&tngroupname=organic_news&rsv_dl=news_b_pn&pn=' + str(pn)
        text = get_one_page(link)  # fetch the paginated link, not the bare base url
        for item in parse_one_page(c, text):
            print(item)
            write_to_sql(item)

if __name__ == '__main__':
    create_sql()
    companys = ['阿里巴巴', '京东', '亚马逊', '华为', '贵州茅台']
    url = 'https://www.baidu.com/s?tn=news&rtt=4&bsst=1&cl=2&wd={}&medium=0'
    urls = [url.format(com) for com in companys]
    for c, url in enumerate(urls):
        main(c, url)
        time.sleep(1)
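
To spot-check what actually landed in MySQL, here is a minimal readback sketch, assuming the same local connection settings used above:

import pymysql

# Count stored rows per company; assumes the crawler above has already run.
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306, db='spiders')
cursor = db.cursor()
cursor.execute('SELECT company, COUNT(*) FROM baidu GROUP BY company')
for company, count in cursor.fetchall():
    print(company, count)
db.close()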

 


Source: https://www.cnblogs.com/oeong/p/11722360.html
