首页 > 其他 > 详细

第一个爬虫和测试

时间:2020-05-13 17:30:27      阅读:68      评论:0      收藏:0      [点我收藏+]

一、访问搜狗主页

# Fetch the Sogou homepage once and print basic facts about the response.
import requests

# A timeout keeps the script from hanging forever if the server never answers.
r = requests.get("https://www.sogou.com/", timeout=30)
print(r.status_code)        # HTTP status code of the response
print(r.encoding)           # encoding requests chose from the HTTP headers
print(r.apparent_encoding)  # encoding guessed from the response body
# The encoding must be a string; a bare utf-8 is the expression `utf - 8`
# and raises NameError.
r.encoding = "utf-8"
print(r.text)               # body decoded with the encoding set above

技术分享图片

 1 #爬取搜狗
 2 import requests
 3 def getHTMLText(url):
 4     try: 
 5         for i in range(0,20):                   
 6             r = requests.get(url, timeout=30)
 7         r.raise_for_status() #如果状态不是200,引发异常
 8         r.encoding = utf-8 
 9         return r.status_code,r.text,type(r.text),type(r.content),len(r.text),len(r.content) #返回状态,text内容,text和content属性,content网页长度
10     except:
11         return ""
12 url = "https://www.sogou.com/"
13 print(getHTMLText(url))
1  RESTART: C:/Users/86136/Anaconda3/pkgs/python-3.7.4-h5263a28_0/Scripts/爬取搜狗.py 
2 (200, <!DOCTYPE html><html lang="cn"><head><script type="text/javascript">/*file=static/js/resourceErrorReport.js*/!function(a){var n=(new Date).getTime(),r=a.location.protocol;function c(e,t){var o=(new Date).getTime()-n;(new Image).src=["//pb.sogou.com/pv.gif?uigs_productid=wapapp&type=resource-error&stype=",e,"&timestamp=",o,"&protocol=",r,"&host=",encodeURIComponent(a.location.host),"&path=",encodeURIComponent(a.location.pathname),"&resource=",encodeURIComponent(t)].join("")}function e(e){if((e=e||a.event)&&"error"===e.type){var t=e.srcElement?e.srcElement:e.target;if(t){var o,n,r=t.tagName;"LINK"===r?(n="css",(o=t.getAttribute("href"))&&o.match(/\\.css($|\\?)/)&&c(n,o)):"SCRIPT"===r&&(n="js",(o=t.getAttribute("src"))&&o.match(/\\.js($|\\?)/)&&c(n,o))}}}r&&(r=r.substring(0,r.length-1)),a.addEventListener?a.addEventListener("error",e,!0):a.attachEvent&&a.attachEvent("onerror",e)}(window);</script><script>window._speedMark = new Date();  window.lead_ip = \‘120.238.230.35\‘;\n    window.now = 1589360471183;</script><meta charset="utf-8"><link rel="dns-prefetch" href="//img01.sogoucdn.com"><link rel="dns-prefetch" href="//img02.sogoucdn.com"><link rel="dns-prefetch" href="//img03.sogoucdn.com"><link rel="dns-prefetch" href="//img04.sogoucdn.com"><link rel="dns-prefetch" href="//dlweb.sogoucdn.com"><title>搜狗搜索引擎 - 上网从搜狗开始</title><link rel="shortcut icon" href="/images/logo/new/favicon.ico?v=4" type="image/x-icon"><meta http-equiv="X-UA-Compatible" content="IE=Edge"><link rel="search" type="application/opensearchdescription+xml" href="/content-search.xml" title="搜狗搜索"><meta name="keywords" content="搜狗搜索,网页搜索,微信搜索,视频搜索,图片搜索,音乐搜索,新闻搜索,软件搜索,问答搜索,百科搜索,购物搜索"><meta name="description" content="搜狗搜索是全球第三代互动式搜索引擎,支持微信公众号和文章搜索、知乎搜索、英文搜索及翻译等,通过自主研发的人工智能算法为用户提供专业、精准、便捷的搜索服务。"><link rel="stylesheet" type="text/css" href="//dlweb.sogoucdn.com/pcsearch/web/index/css/index_style_9454fa6.css"><style>.wrapper .suggestion{border:1px solid #e8e8e8;width:653px;-moz-box-shadow:0 1px 8px 
rgba(0,0,0,.1);-webkit-box-shadow:0 1px 8px rgba(0,0,0,.1);box-shadow:0 1px 8px rgba(0,0,0,.1);border-top-left-radius:0;border-top-right-radius:0;border-bottom-right-radius:2px;border-bottom-left-radius:2px;top:43px}.wrapper .suglist{width:206px}.wrapper .suglist .keyword{color:#7a77c8}.big-scn .suggestion{width:820px}.big-scn .suglist{width:236px}.wrapper .suglist{padding:4px 0}input[type=text]::-ms-clear{display:none}</style></head><body color-style="white"><div class="wrapper " id="wrap"><div class="header"> <div class="top-nav"><ul><li><a onclick="st(this,\‘40030300\‘,\‘news\‘)" href="http://news.sogou.com" uigs-id="nav_news" id="news">新闻</a></li><li class="cur"><span>网页</span></li><li><a onclick="st(this,\‘73141200\‘,\‘weixin\‘)" href="http://weixin.sogou.com/" uigs-id="nav_weixin" id="weixinch">微信</a></li><li><a onclick="st(this,\‘40051200\‘,\‘zhihu\‘)" href="http://zhihu.sogou.com/" uigs-id="nav_zhihu" id="zhihu">知乎</a></li><li><a onclick="st(this,\‘40030500\‘,\‘pic\‘)" href="http://pic.sogou.com" uigs-id="nav_pic" id="pic">图片</a></li><li><a onclick="st(this,\‘40030600\‘,\‘video\‘)" href="https://v.sogou.com/" uigs-id="nav_v" id="video">视频</a></li><li><a href="http://mingyi.sogou.com?fr=common_index_nav" uigs-id="nav_mingyi" id="mingyi" onclick="st(this,\‘\‘,\‘myingyi\‘)">明医</a></li><li><a href="http://english.sogou.com?fr=pcweb_index_nav" uigs-id="nav_overseas" id="overseas" onclick="st(this,\‘\‘,\‘overseas\‘)">英文</a></li><li><a onclick="st(this,\‘web2ww\‘,\‘wenwen\‘)" href="https://wenwen.sogou.com/?ch=websearch" uigs-id="nav_wenwen" id="index_more_wenwen">问问</a></li><li><a href="http://scholar.sogou.com?fr=common_index_nav" uigs-id="nav_scholar" id="scholar" onclick="st(this,\‘\‘,\‘scholar\‘)">学术</a></li><li class="show-more"><a href="javascript:void(0);" id="more-product">更多<i class="m-arr"></i></a><div class="pos-more" id="products-box" style="top:40px"><span class="ico-san"></span><a onclick="st(this,\‘40031000\‘)" href="http://map.sogou.com" 
uigs-id="nav_map" id="map">地图</a><a onclick="st(this,\‘40031500\‘)" href="http://gouwu.sogou.com/" uigs-id="nav_gouwu" id="index_more_gouwu">购物</a><a onclick="st(this,\‘40051203\‘)" href="http://baike.sogou.com/Home.v" uigs-id="nav_baike" id="index_more_baike">百科</a><a onclick="st(this)" href="http://zhishi.sogou.com" uigs-id="nav_zhishi" id="index_more_zhishi">知识</a><a onclick="st(this,\‘40051205\‘)" href="http://as.sogou.com/" uigs-id="nav_app" id="index_more_appli">应用</a><a onclick="st(this,\‘40051205\‘,\‘fanyi\‘)" href="http://fanyi.sogou.com?fr=common_index_nav_pc" uigs-id="nav_fanyi" id="index_more_fanyi">翻译</a><a href="http://index.sogou.com" uigs-id="nav_index" id="index_more_index">指数</a>  <a href="http://dangjian.sogou.com" uigs-id="nav_dangjian" id="dangjian" onclick="st(this,\‘\‘,\‘dangjian\‘)">党建</a>  <span class="all"><a onclick="st(this,\‘40051206\‘)" href="http://www.sogou.com/docs/more.htm?v=1" uigs-id="nav_all" target="_blank">全部</a></span></div></li></ul></div><div class="user-box"><div class="local-weather" id="local-weather"><div class="wea-box" id="cur-weather" style="display:none"></div>  <div class="pos-more" id="detail-weather" style="top:40px;left:-80px"></div>  </div><span class="line" id="user-box-line" style="display:none"></span><div class="user-enter"><a href="javascript:void(0);" id="show-card"  style="display:none"  uigs-id="settings_show-card">显示卡片</a>  <a href="javascript:void(0);" class="enter" id="loginBtn">登录</a>  </div></div></div><div class="content" id="content"><div class="pos-header" id="top-float-bar"><div class="part-one"></div><div class="part-two" id="card-tab-layer"><div class="c-top" id="top-card-tab"></div></div></div><div class="logo2" id="logo-s"><span></span></div><div class="logo" id="logo-l"><span></span></div> <div class="search-box querybox-focus" id="search-box"><form action="/web" name="sf" id="sf"><span class="sec-input-box"><input type="text" class="sec-input active" name="query" id="query" 
maxlength="100" len="80" autocomplete="off"></span><span class="enter-input"><input type="submit" value="搜狗搜索" id="stb"></span><input type="hidden" name="_asf" value="www.sogou.com"> <input type="hidden" name="_ast"> <input type="hidden" name="w" value="01019900"> <input type="hidden" name="p" value="40040100"> <input type="hidden" name="ie" value="utf8">  <input type="hidden" name="from" value="index-nologin">  <input type="hidden" name="s_from" value="index"><div class="keywords-tips" id="keywordsTips" style="display:none"><i></i><p>“<strong id="keywordsTipsStrong">369</strong>”后面的文字被忽略,搜狗的查询限制在40个汉字以内。</p></div></form></div>  </div><div class="card-box" id="card-box" style="display:none"><div class="card-box2" id="card-box2"><div class="c-top" id="card-tab-box"><a href="javascript:void(0);" uigs-id="settings_close-card" id="close-card" class="shezhi"></a></div><div class="c-main" id="card-content"></div></div></div><div class="loog-more" id="scroll-more" style="display:none"><a href="javascript:void(0);" uigs-id="scroll-more">滚动查看更多<br><span class="ico_san"></span></a></div><div class="ft" id="footer"  style="display:none" ><a href="http://b.sogou.com/" target="_blank" uigs-id="footer_tuiguang">企业推广</a><span class="line"></span><a href="http://corp.sogou.com/" target="_blank" uigs-id="footer_about">关于搜狗</a><span class="line"></span><a href="http://ir.sogou.com/" target="_blank" uigs-id="footer_aboutEnglish">About Sogou</a><span class="line"></span><a href="http://www.sogou.com/docs/terms.htm?v=1" target="_blank" uigs-id="footer_disclaimer">免责声明</a><span class="line"></span><a href="http://fankui.help.sogou.com/index.php/web/web/index/type/4" target="_blank" uigs-id="footer_feedback">意见反馈及投诉</a><span class="line"></span><a href="http://corp.sogou.com/private.html" target="_blank" uigs-id="footer_private">隐私政策</a><br>&copy;&nbsp;2004-2020&nbsp;Sogou.com&nbsp;/&nbsp;<span class="g">京网文 (2016) 6432-852号</span>&nbsp;/&nbsp;<a href="http://www.miibeian.gov.cn" 
target="_blank" class="g">京ICP证050897号</a><br><span class="g">(京)-经营性-2016-0019</span>&nbsp;/&nbsp;<a href="http://www.miibeian.gov.cn/" target="_blank" class="g">京ICP备11001839号-1</a>&nbsp;/&nbsp;<a href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000025" class="ba" target="_blank">京公网安备11000002000025号</a></div>  <div class="ft-v1" id="QRcode-footer" style="padding-bottom:28px"><div class="erwm-box"><span class="ewm"><img src="/web/index/images/erweima2.png" ></span><div class="erwx"><p>下载搜狗搜索APP</p></div></div><div class="ft-info"><a uigs-id="mid_pinyin" href="http://pinyin.sogou.com/" target="_blank"><i class="i1"></i>搜狗输入法</a><span class="line"></span><a uigs-id="mid_liulanqi" href="http://ie.sogou.com/" target="_blank"><i class="i2"></i>浏览器</a><span class="line"></span><a uigs-id="mid_daohang" href="http://123.sogou.com/" target="_blank"><i class="i3"></i>网址导航</a><br><a href="http://corp.sogou.com/" target="_blank" class="g">关于搜狗</a>&nbsp;-&nbsp;<a href="http://ir.sogou.com/" target="_blank" class="g">About Sogou</a>&nbsp;-&nbsp;<a href="http://b.sogou.com/" target="_blank" class="g">企业推广</a>&nbsp;-&nbsp;<a href="http://www.sogou.com/docs/terms.htm?v=1" target="_blank" class="g">免责声明</a>&nbsp;-&nbsp;<a href="http://fankui.help.sogou.com/index.php/web/web/index/type/4" target="_blank" class="g">意见反馈及投诉</a>&nbsp;-&nbsp;<a href="http://corp.sogou.com/private.html" target="_blank" class="g" uigs-id="footer_private">隐私政策</a><br>&copy;&nbsp;2004-2020&nbsp;Sogou.com&nbsp;/&nbsp;<span class="g">京网文 (2016) 6432-852号</span>&nbsp;/&nbsp;<span class="g">(京)-经营性-2016-0019</span><br><span class="g">京ICP证050897号</span>&nbsp;/&nbsp;<span class="g">京ICP备11001839号-1</span>&nbsp;/&nbsp;<a href="http://www.beian.gov.cn/portal/registerSystemInfo?recordcode=11000002000025" class="ba" target="_blank">京公网安备11000002000025号</a></div></div> <div class="kuozhan" id="QRcode-box" style="display:none"><a href="javascript:void(0);" id="miniQRcode"></a><span 
id="QRcode"></span></div><a href="javascript:void(0);" class="back-top" id="back-top"></a></div> <script>var SugPara, uigs_para, msBrowserName = navigator.userAgent.toLowerCase(),msIsSe = false,msIsMSearch = false, hasDoodle = false, queryinput = document.getElementById(\‘query\‘);</script><script>/*file=static/js/indexjs.js*/function indexjsInit(e,o,n,a,t,i,s,u){var r={puid:t,cards:i,cards_sw:s,uigs_cookie:"SUID,sct,SUV"};function c(){try{window.external.metasearch("make_connection","www.google.com.hk")}catch(e){}}uigs_para={uigs_productid:"webapp",type:"webindex_new",stype:e?"login":"nologin",scrnwi:screen.width,scrnhi:screen.height,uigs_pbtag:"A",uigs_cookie:"SUID,sct",protocol:"https:"==location.protocol.toLowerCase()?"https":"http"},e&&(uigs_para=Object.assign(uigs_para,r),window.loginCardConfig={show:"1"===s,cardTab:[{text:"推荐",id:"card-news-tab",initFn:"newsInit",className:"cur"},{text:"导航",id:"card-nav-tab",initFn:"navInit"}]}),SugPara={queryboxid:"search-box",enableSug:!0,sugType:"web",domain:"w.sugg.sogou.com",productId:"web",sugFormName:"sf",inputid:"query",submitId:"stb",suggestRid:"01015002",normalRid:"01019900",useParent:1,sugglocation:"index",showVr:!0,showHotwords:!0,suggAbtestObject:o},/se 2\\.x/i.test(msBrowserName)&&(msIsSe=!0),/metasr/i.test(msBrowserName)&&(msIsMSearch=!0),queryinput&&msIsSe&&msIsMSearch&&(queryinput.addEventListener?(queryinput.addEventListener("keypress",c,!1),queryinput.addEventListener("keydown",c,!1)):queryinput.attachEvent?(queryinput.attachEvent("onkeypress",c),queryinput.attachEvent("onkeydown",c)):(queryinput.onkeypress=c,queryinput.onkeydown=c)),window.m_s_index=function(){var e=document.sf.query,o=Math.round(1e3*((new Date).getTime()+Math.random()));e.focus(),new RegExp("kw=([^&]+)").test(location.search)&&0==e.value.length&&(e.value=decodeURIComponent(RegExp.$1)),document.cookie.indexOf("SUV=")<0&&(document.cookie="SUV="+o+";path=/;expires=Sun, 29 July 2026 00:00:00 UTC;domain="+function(){var 
e=document.domain;return e.indexOf("sogou.com")==e.length-9?".sogou.com":e.indexOf("soso.com")==e.length-8?".soso.com":-1!=e.indexOf("sogo.com")?".sogo.com":void 0}()),n&&((new Image).src="//pb6.sogou.com/v6")},window.st=function(e,o,n,t){var i=document.sf.query,s=encodeURIComponent(i.value),u={news:"http://news.sogou.com/news?ie=utf8&query=",web:"web?ie=utf8&query=",weixin:"http://weixin.sogou.com/weixin?type=2&ie=utf8&query=",zhihu:"http://zhihu.sogou.com/zhihu?ie=utf8&query=",pic:"http://pic.sogou.com/pics?ie=utf8&query=",video:"https://v.sogou.com/v?ie=utf8&query=",myingyi:"https://www.sogou.com/web?m2web=mingyi.sogou.com&ie=utf8&query=",overseas:"http://english.sogou.com?b_o_e=1&ie=utf8&fr=pcweb_index_nav&query=",scholar:"http://scholar.sogou.com?ie=utf8&fr=common_index_nav&query=",fanyi:"http://fanyi.sogou.com/?fr=common_index_nav_pc&ie=utf8&keyword=",wenwen:"http://wenwen.sogou.com/s/?ch=websearch&w=",dangjian:a},r=u[n]||e.href;function c(e){return-1<e.indexOf("?")?"&":"?"}i&&""!==i.value&&(u[n]?r=u[n]+s:0<r.indexOf("kw=")?r=r.replace(new RegExp("kw=[^&$]*"),"kw="+s):r+=c(r)+"kw="+s),o&&(r+=c(r)+"p="+o),t&&0<t.length&&(r+="#"+t),!i||""!=i.value||"wenwen"!=n&&"dangjian"!=n||(r=e.href),e.href=r},window.cid=function(e,o){var n=document.sf.query,t=encodeURIComponent(n.value);t?"web2ww"===o?e.href+="s/?cid=web2ww&w="+t:"web2bk"===o&&(e.href+="Search.e?sp=S"+t+"&cid=web2bk"):e.href+="?cid="+o},window.m_s_index()}indexjsInit(false, {"suggestHistoryStrategy1":"","suggestHistoryStrategy2":"0|1|2|3|4|5|6|7|8","suggHistoryAbtest":""}, true, \‘http://dangjian.sogou.com/dangjian?query=\‘, \‘invaliduser\‘, \‘\‘, \‘\‘);</script><script src="//dlweb.sogoucdn.com/pcsearch/web/index/js/suggbase_b9937f7.js"></script>  <script src="//dlweb.sogoucdn.com/pcsearch/js/common/widget/index_login_a7ce741.js"></script><script src="//account.sogou.com/static/api/passport-async.js"></script>  <script src="//dlweb.sogoucdn.com/pcsearch/web/index/js/searchbase_211ecd8.js"></script>  
</body></html><!--zly-->, 
<class 'str'>, #text属性的类型
<class 'bytes'>, #content属性的类型
13877, 14531) #text属性和content属性所返回网页内容的长度

第一个爬虫和测试

原文:https://www.cnblogs.com/wendy123/p/12883566.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!