
Crawlers, Part 2: Cookies & Regex


1. Cookie usage example

import urllib.request
import urllib.parse

'''Visit the Renren profile page while carrying a cookie:
        1. Log in to Renren in a browser
        2. Capture the next request and copy the cookie it sends
        3. Write code that sends that cookie along
        4. If that still fails, send every request header verbatim (last resort)'''
url = 'http://www.renren.com/971302264/profile'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    'Cookie': 'anonymid=jxczgs3yw3oby9; _de=9718742970B17AD7ABC87CAAA6A740CC; '
              'p=176166a1bb4a1d1a163443225f52e24e4; first_login_flag=1; ln_uact=18404904721; '
              'ln_hurl=http://head.xiaonei.com/photos/0/0/men_main.gif; '
              't=21d77ab67402235d4282cf725f991aab4; societyguester=21d77ab67402235d4282cf725f991aab4; '
              'id=971302264; xnsid=6d1019cd; ver=7.0; loginfrom=null; JSESSIONID=abcOB4RHNlyeq8Dv_7sUw; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325729; '
              'jebe_key=2819f31f-79cc-428e-b61e-8b968e2beda4%7C920b82268747e02c45f3056eeda651c7%7C1561538325515%7C1%7C1561538325732; wp_fold=0'
}

req = urllib.request.Request(url, headers=headers)   # build the request with the cookie attached

rep = urllib.request.urlopen(req)

with open('ren.html', 'wb') as fp:
    fp.write(rep.read())
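
Hard-coding a captured cookie works only until it expires. A longer-lived variant is to persist cookies with the standard library's MozillaCookieJar; a minimal sketch (the filename cookies.txt is an arbitrary choice, not from the original post):

import http.cookiejar
import urllib.request

cj = http.cookiejar.MozillaCookieJar('cookies.txt')     # cookie jar backed by a file
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cj))

# after logging in once through this opener, persist the cookies:
cj.save(ignore_discard=True, ignore_expires=True)

# in a later run, load them back instead of logging in again:
cj.load('cookies.txt', ignore_discard=True, ignore_expires=True)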

2. Logging into Renren programmatically

import urllib.request
import urllib.parse
import http.cookiejar

'''Log into Renren from Python:
        1. Log in with a browser and capture the traffic
        2. Grab the target url and the POST form data
        3. Send a request carrying that data'''

'''Build an opener of this kind; logging in through it stores the cookies in it'''
cj = http.cookiejar.CookieJar()                      # create the CookieJar object
handler = urllib.request.HTTPCookieProcessor(cj)     # create the cookie handler
opener = urllib.request.build_opener(handler)        # create the opener
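# (The opener now behaves like urlopen, except that every Set-Cookie header it
# receives is stored in cj and replayed on later requests made through it.)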

post_url = 'http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=2019531649636'

form_data = {'email': '18404904721',
             'icode': '',
             'origURL': 'http://www.renren.com/home',
             'domain': 'renren.com',
             'key_id': '1',
             'captcha_type': 'web_login',
             'password': '641fd8bce69ff3a3acfb14fc094fefe9487f9b4f843d18063fcce22e0a468066',
             'rkey': '2c3ae276413c03a1eb5159d355895bd0',
             'f': 'http%3A%2F%2Fwww.renren.com%2F971302264%2Fprofile'}

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

form_data = urllib.parse.urlencode(form_data).encode()        # encode the POST form as bytes

req = urllib.request.Request(url=post_url, headers=headers)   # build the request object

rep = opener.open(req, data=form_data)                        # send the POST request

# print(rep.read().decode())

'''Fetch the profile page to verify that the login succeeded'''
get_url = 'http://www.renren.com/971302264/profile'

req1 = urllib.request.Request(url=get_url, headers=headers)

rep1 = opener.open(req1)     # reuse the same opener: it now carries the session cookies

with open('guanli.html', 'wb') as fp:
    fp.write(rep1.read())
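
A quick sanity check that the login worked (an assumption, not from the original: a logged-in profile page should contain the account's user id, while a redirect back to the login form should not):

with open('guanli.html', 'rb') as fp:
    html = fp.read().decode('utf-8', errors='ignore')
print('logged in' if '971302264' in html else 'not logged in')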

3. Extracting content with regular expressions

import re

'''() subpatterns (groups)'''
# string = '<div><span>悟空</span></div>'
# '''Match the string above; the backreferences force the tags to pair up'''
# pattern = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
# ret = pattern.search(string)
# print(ret)
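
# To see what the backreferences buy: \1 and \2 must repeat exactly what the
# groups captured, so mismatched closing tags stop matching (a minimal sketch):
pattern_bal = re.compile(r'<(\w+)><(\w+)>\w+</\2></\1>')
print(pattern_bal.search('<div><span>悟空</span></div>'))   # matches
print(pattern_bal.search('<div><span>悟空</div></span>'))   # None: closing tags swapped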

'''Greedy vs. non-greedy matching'''
# string = '<div>八戒</div></div></div>'
# '''Match the string above and note how far each pattern reaches'''
# pattern1 = re.compile(r'<div>.*</div>')     # greedy: runs to the last </div>
# pattern2 = re.compile(r'<div>.*?</div>')    # non-greedy: stops at the first </div>
# ret1 = pattern1.search(string)
# ret2 = pattern2.search(string)
# print(ret1)
# print(ret2)
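
# With a capture group the difference is easy to print (a minimal sketch; the
# example above is commented out, so the test string is redefined here):
string2 = '<div>八戒</div></div></div>'
print(re.search(r'<div>(.*)</div>', string2).group(1))    # '八戒</div></div>' (greedy)
print(re.search(r'<div>(.*?)</div>', string2).group(1))   # '八戒' (non-greedy)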

'''re.M: multi-line matching (^ and $ anchor at every line)'''
string = '''beautiful
beach'''
pattern = re.compile(r'^bea', re.M)
ret = pattern.findall(string)
print(ret)    # ['bea', 'bea']
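
# For contrast (a minimal sketch): without re.M, ^ anchors only at the very
# start of the whole string, so only the first line matches.
print(re.findall(r'^bea', string))    # ['bea']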

'''re.S: "dot-all" mode, . also matches newlines'''
# string = '<div>《沁园春-雪》' \
#          '北国风光,千里冰封,万里雪飘。' \
#          '望长城内外,惟余莽莽。' \
#          '大河上下,顿失滔滔。</div>'
# pattern = re.compile(r'.*', re.S)
# ret = pattern.search(string)
# print(ret)

'''re.I: case-insensitive matching'''
# string = 'Life Is Short You Must Be Sexy'
# pattern = re.compile(r'life is short you must be sexy', re.I)
# ret = pattern.search(string)
# print(ret)

'''Substitution with regex'''
string = 'Life Is Short You Must Be Sexy'
pattern = re.compile(r'Sexy')
ret = re.sub(pattern, 'sao', string)    # module-level form
ret2 = pattern.sub('lang', string)      # method form, same effect
print(ret)
print(ret2)

'''Substitution with a function: a match object goes in, the replacement string comes out'''
def func(a):
    ret = int(a.group())
    return str(ret - 3)

string = 'The ideal height is 175cm'
pattern = re.compile(r'\d+')
ret2 = pattern.sub(func, string)
print(ret2)    # The ideal height is 172cm
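
# The same numeric substitution as a one-liner with a lambda (a sketch):
print(re.sub(r'\d+', lambda m: str(int(m.group()) - 3), 'The ideal height is 175cm'))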

4. Regex example: scraping images from Qiushibaike

import urllib.request
import urllib.parse
import re
import  os

def create_request(url, page):
    post_url = url + str(page) + '/'
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}
    req = urllib.request.Request(url=post_url, headers=header)
    return req

def download_image(content):
    pattern = re.compile(r'<div class="thumb">.*?<img src="(.*?)" .*?>.*?</div>', re.S)
    img_list = pattern.findall(content)
    # print(img_list)
    for img_src in img_list:
        img_url = 'https:' + img_src      # the src is protocol-relative, so prepend the scheme
        dirname = 'qiutu'
        if not os.path.exists(dirname):
            os.mkdir(dirname)
        img_name = img_url.split('/')[-1]
        filepath = dirname + '/' + img_name
        urllib.request.urlretrieve(img_url, filepath)

def main():
    url = 'https://www.qiushibaike.com/pic/page/'

    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))

    for page in range(start_page, end_page + 1):   # +1 so the end page is included
        print('Downloading page %s...' % page)
        # build the request
        req = create_request(url, page)

        # send the request and read the content
        rep = urllib.request.urlopen(req).read().decode()

        # parse the content and download the images
        download_image(rep)
        print('Finished page %s.' % page)

if __name__ == '__main__':
    main()
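
Real pages are messy: one dead image link raises and aborts the whole run, and fetching page after page with no pause is unfriendly to the server. A defensive variant of the download loop, as a minimal sketch (download_image_safe is a hypothetical helper, not part of the original):

import os
import time
import urllib.error
import urllib.request

def download_image_safe(img_list, dirname='qiutu'):
    os.makedirs(dirname, exist_ok=True)              # no need to check exists() first
    for img_src in img_list:
        img_url = 'https:' + img_src
        filepath = os.path.join(dirname, img_url.split('/')[-1])
        try:
            urllib.request.urlretrieve(img_url, filepath)
        except urllib.error.URLError as e:           # skip broken links instead of crashing
            print('skipped %s: %s' % (img_url, e))
        time.sleep(0.5)                              # throttle requests a little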

5. Regex example: scraping quotes

import urllib.request
import urllib.parse
import re
import  os

def create_request(url, page=None):
    if page is not None:
        url = url + str(page) + '.html'
    # print(url)
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) '
                            'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'}

    req = urllib.request.Request(url=url, headers=header)
    return req

def get_content(href):
    request = create_request(href)
    content_html = urllib.request.urlopen(request).read().decode()
    pattern = re.compile(r'<div class="neirong">(.*?)</div>', re.S)
    content_list = pattern.findall(content_html)
    # print(content_list)
    pat = re.compile(r'<img .*?>')
    text = pat.sub('', content_list[0])    # strip the <img> tags, keep the text
    return text

def parse_html(content):

    # regex out the article links and titles
    pattern = re.compile(r'<h3><a href="/mingrenjingdianyulu/(\d+/\d+/\d+\.html)"><b>(.*?)</b></a></h3>', re.S)
    title_list = pattern.findall(content)
    # print(title_list)

    for i in title_list:

        href = 'http://www.yikexun.cn/mingrenjingdianyulu/' + i[0]    # build the article's absolute url
        title = i[1]

        # request href and extract the article body
        content = get_content(href)

        # write it out as an HTML fragment
        string = '''<!DOCTYPE html>
                  <html lang="en">
                  <head>
                    <meta charset="UTF-8">
                    <title>Title</title>
                  </head>
                  <body>
                    <h1>%s</h1>%s
                  </body>''' % (title, content)

        with open('yulu.html', 'a', encoding='utf8') as fp:
            fp.write(string)

def main():
    url = 'http://www.yikexun.cn/mingrenjingdianyulu/list_10_'

    start_page = int(input('Start page: '))
    end_page = int(input('End page: '))

    for page in range(start_page, end_page + 1):
        print('Downloading page %s...' % page)

        # build the request
        req = create_request(url, page)

        # send the request and read the content
        rep = urllib.request.urlopen(req).read().decode()

        # parse the content and save the quotes
        parse_html(rep)
        print('Finished page %s.' % page)

if __name__ == '__main__':
    main()

Original: https://www.cnblogs.com/Finance-IT-gao/p/11099529.html