首页 > 编程语言 > 详细

python随笔(一)

时间:2019-04-04 12:57:43      阅读:281      评论:0      收藏:0      [点我收藏+]

python爬虫获取QQ音乐和豆瓣的最新电影音乐名字

先上代码,开源出来供大家一起学习,代码如下:

#!python2
#coding:utf-8
# Module author tag; must be a string literal (a bare name raises NameError).
__author__ = 'OldHarry'

import urllib2
import os
import re
import json
import xlsxwriter
import sys
# Python 2 only: force the interpreter-wide default string encoding to UTF-8
# so that implicit str <-> unicode conversions of the scraped Chinese text do
# not fall back to ASCII. reload(sys) is required because Python 2's start-up
# removes sys.setdefaultencoding from the module namespace.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    reload(sys)
    sys.setdefaultencoding(defaultencoding)

def getHtml(url):
    """Fetch *url* with browser-like request headers and return the raw body.

    Args:
        url: Address to request.

    Returns:
        The response body exactly as returned by ``read()`` (not decoded).

    A message is printed when the response status is not 200; the body is
    still returned in that case. Exceptions raised by ``urllib2.urlopen`` are
    not caught here and propagate to the caller.
    """
    send_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
    }
    request = urllib2.Request(url, headers=send_headers)
    response = urllib2.urlopen(request)
    try:
        status = response.getcode()
        if status != 200:
            # Same output as the Python 2 statement `print ("..."), status`.
            print("%s %s" % ("访问出现错误...错误代码:", status))
        return response.read()
    finally:
        # Release the connection even if reading the body fails.
        response.close()


def kugoumusic(url):
    """Extract song entries from the page at *url* and return their names.

    Every ``<span class="songName">A - B</span>`` element in the HTML returned
    by ``getHtml`` is matched; the ``B`` part (the text after " - ") is
    decoded from UTF-8 and collected. Repeated ``(A, B)`` entries are kept
    only once, in first-seen order. The resulting list is also printed as
    JSON.

    Args:
        url: Page to scrape.

    Returns:
        List of decoded ``B`` strings.
    """
    page = getHtml(url)
    song_re = re.compile(r'<span class="songName">(.*?) - (.*?)</span>')
    seen = set()
    names = []
    for entry in song_re.findall(page):
        # De-duplicate on the whole (A, B) match. The previous check compared
        # the match tuple against the list of decoded strings, which could
        # never be equal, so duplicates were never removed.
        if entry in seen:
            continue
        seen.add(entry)
        names.append(entry[1].decode('utf8'))
    print(json.dumps(names, encoding="UTF-8", ensure_ascii=False))
    return names


def qqmusic(url):
    """Extract names from the JSON text returned for *url*.

    The response from ``getHtml`` is scanned with a regular expression that
    matches an ``"action":{...}`` object followed by an ``"album"`` object and
    captures that album object's ``"name"`` value. Names are stripped of
    surrounding whitespace and de-duplicated in first-seen order. The list is
    also printed as JSON.

    Args:
        url: Endpoint to request.

    Returns:
        List of unique captured names.
    """
    page = getHtml(url)
    album_re = re.compile(r'{"action":{"alert":[0-9]+,"icons":[0-9]+,"msgdown":[0-9]+,"msgfav":[0-9]+,"msgid":[0-9]+,"msgpay":[0-9]+,"msgshare":[0-9]+,"switch":[0-9]+},"album":{"id":[0-9]+,"mid":"[a-zA-Z0-9]+","name":"(.*?)"')
    names = []
    for raw in album_re.findall(page):
        # str.strip() returns a new string; the result has to be kept
        # (the earlier bare `xxx.strip()` call discarded it).
        name = raw.strip()
        if name not in names:
            names.append(name)
    print(json.dumps(names, encoding="UTF-8", ensure_ascii=False))
    return names

def dbmovie(url):
    """Collect ``alt`` attribute values from the page at *url*.

    Matches every occurrence of ``alt="..." rel="<lowercase letters>"
    class="" />`` in the HTML returned by ``getHtml`` and captures the ``alt``
    text. The list is printed as JSON and returned; duplicates are not
    removed.

    Args:
        url: Page to scrape.

    Returns:
        List of captured ``alt`` strings, in page order.
    """
    page = getHtml(url)
    alt_re = re.compile(r'alt="(.*?)" rel="[a-z]+" class="" />')
    titles = alt_re.findall(page)
    print(json.dumps(titles, encoding="UTF-8", ensure_ascii=False))
    return titles

def rmmovie(url):
    """Collect every ``"title":"..."`` value from the response for *url*.

    The body returned by ``getHtml`` is scanned as plain text with a regular
    expression (it is not parsed as JSON), so values are returned exactly as
    they appear in the response. The list is printed as JSON and returned;
    duplicates are not removed.

    Args:
        url: Endpoint to request.

    Returns:
        List of captured title strings, in response order.
    """
    body = getHtml(url)
    title_re = re.compile(r'"title":"(.*?)"')
    titles = title_re.findall(body)
    print(json.dumps(titles, encoding="UTF-8", ensure_ascii=False))
    return titles
def rmdsj():
    """Return titles from three consecutive pages of the Douban TV listing.

    Calls ``rmmovie`` for ``page_start`` = 0, 20 and 40 (``page_limit`` is 20
    in the URL) and concatenates the three result lists in that order.

    Returns:
        Combined list of titles from the three requests.
    """
    # Plain concatenation is used on purpose: the URL contains literal '%'
    # escapes, so %-formatting would misinterpret it.
    base_url = ('https://movie.douban.com/j/search_subjects?type=tv'
                '&tag=%E7%83%AD%E9%97%A8&sort=recommend'
                '&page_limit=20&page_start=')
    titles = []
    for page_start in (0, 20, 40):
        titles += rmmovie(base_url + str(page_start))
    return titles

def runtest():
    """Scrape every configured chart and write the results to a spreadsheet.

    For each source the heading is printed, the matching scraper is called,
    and the returned list becomes one column of the sheet: row 1 holds the
    bold headings, the data starts at row 2, and every column is 30 wide.
    The workbook is written to ``TXT.xls`` in the current working directory.

    Exceptions raised by the scrapers or by xlsxwriter are not caught.
    """
    # os.path.join instead of a hard-coded backslash, so the path is also
    # valid on non-Windows systems.
    # NOTE(review): xlsxwriter produces the .xlsx format; with an .xls
    # extension Excel may warn when opening the file. The name is kept for
    # backward compatibility - confirm whether it can be changed.
    IP_PATH = os.path.join(os.path.abspath('.'), 'TXT.xls')

    # (heading, scraper, positional arguments) - one entry per sheet column.
    sources = [
        ("酷狗音乐--新歌榜", kugoumusic, ("http://www.kugou.com/",)),
        ("腾讯音乐--内地新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom2388477980207393&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A1%7D%7D%7D",)),
        ("腾讯音乐--港台新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom6698628102261504&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A2%7D%7D%7D",)),
        ("腾讯音乐--欧美新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom08419989487702839&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A3%7D%7D%7D",)),
        ("腾讯音乐--日本新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom24411354608866187&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A4%7D%7D%7D",)),
        ("腾讯音乐--韩国新歌榜", qqmusic, ("https://u.y.qq.com/cgi-bin/musicu.fcg?-=recom909302436024819&g_tk=5381&loginUin=0&hostUin=0&format=json&inCharset=utf8&outCharset=utf-8&notice=0&platform=yqq.json&needNewCode=0&data=%7B%22comm%22%3A%7B%22ct%22%3A24%7D%2C%22new_song%22%3A%7B%22module%22%3A%22QQMusic.MusichallServer%22%2C%22method%22%3A%22GetNewSong%22%2C%22param%22%3A%7B%22type%22%3A5%7D%7D%7D",)),
        ("豆瓣电影--正在热映", dbmovie, ("https://movie.douban.com/",)),
        ("豆瓣电影--热门电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%83%AD%E9%97%A8&sort=recommend&page_limit=20&page_start=0",)),
        ("豆瓣电影--最新电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E6%9C%80%E6%96%B0&page_limit=20&page_start=0",)),
        ("豆瓣电影--经典电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E7%BB%8F%E5%85%B8&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣电影--可播放电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E5%8F%AF%E6%92%AD%E6%94%BE&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣电影--高分电影", rmmovie, ("https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=time&page_limit=20&page_start=0",)),
        ("豆瓣电影--热门电视剧", rmdsj, ()),
    ]

    # Scrape everything before creating the workbook, announcing each chart
    # on stdout ahead of its request.
    headings = []
    columns = []
    for heading, scraper, args in sources:
        print(heading)
        headings.append(heading)
        columns.append(scraper(*args))

    workbook = xlsxwriter.Workbook(IP_PATH)
    worksheet = workbook.add_worksheet()
    bold = workbook.add_format({'bold': 1, 'align': 'center', 'border': 1})
    bold2 = workbook.add_format({'align': 'center', 'border': 1})

    # Row/column indices are zero-based: (0, 0) is cell A1.
    worksheet.write_row(0, 0, headings, bold)
    SS = 30
    worksheet.set_column(0, len(headings) - 1, SS)
    for col_index, values in enumerate(columns):
        # Data starts on the second row, directly under its heading.
        worksheet.write_column(1, col_index, values, bold2)
    workbook.close()
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    runtest()

主要思路是:第一步抓取并解析网站页面,第二步用正则表达式提取自己想要的数据,第三步在当前文件夹生成一个 Excel 文件并把数据写入其中。

 

 

 第一次写博客,各路大神不喜勿喷,python萌新一枚。

开发环境:Pycharm  python2.7

2019-04-04 11:33:23

Study hard and make progress every day!

python随笔(一)

原文:https://www.cnblogs.com/Harrydz/p/10653926.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!