$ det http://example.python-scraping.com
'''
[{'name': 'jquery', 'version': '1.11.0'},
{'name': 'modernizr', 'version': '2.7.1'},
{'name': 'nginx', 'version': '1.12.2'}]
'''
$ docker pull wappalyzer/cli
$ docker run wappalyzer/cli http://example.python-scraping.com
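Besides these fingerprinting tools, a much lighter check (a quick sketch not from the original post; it only shows whatever the server chooses to advertise) is to read the Server response header directly:

import urllib.request
resp = urllib.request.urlopen('http://example.python-scraping.com')
# e.g. 'nginx/1.12.2', matching the detectem output above
print(resp.headers.get('Server'))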
import whois
# pass a domain name string; 'url' here is just a placeholder
print(whois.whois('url'))
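For example, querying the demo site's registrable domain and reading individual fields (a sketch assuming the python-whois package imported above; which fields come back populated depends on the registry):

w = whois.whois('python-scraping.com')  # registrable domain of the demo site
print(w.domain_name)
print(w.name_servers)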
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

# user_agent='wswp' sets the default user agent string
def download(url, num_retries=2, user_agent='wswp'):
    print('Downloading:', url)
    # set the user agent
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        html = urllib.request.urlopen(request).read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html
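A quick usage check for this first version (it returns the raw response bytes, or None after exhausting retries), using the demo site from this post:

html = download('http://example.python-scraping.com')
print(html is not None)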
# URL handling library
import urllib.request
# regular expression library
import re
# URL error classes
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        resp = urllib.request.urlopen(request)
        # prefer the charset declared in the Content-Type header
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html
def crawl_sitemap(url):
    # download the sitemap file
    sitemap = download(url)
    # extract the sitemap links
    links = re.findall('<loc>(.*?)</loc>', sitemap)
    # download each link
    for link in links:
        html = download(link)
        # scrape html here

test_url = 'http://example.python-scraping.com/sitemap.xml'
crawl_sitemap(test_url)
'''
Downloading: http://example.python-scraping.com/sitemap.xml
Downloading: http://example.python-scraping.com/places/default/view/Afghanistan-1
Downloading: http://example.python-scraping.com/places/default/view/Aland-Islands-2
Downloading: http://example.python-scraping.com/places/default/view/Albania-3
Downloading: http://example.python-scraping.com/places/default/view/Algeria-4
Downloading: http://example.python-scraping.com/places/default/view/American-Samoa-5
Downloading: http://example.python-scraping.com/places/default/view/Andorra-6
Downloading: http://example.python-scraping.com/places/default/view/Angola-7
Downloading: http://example.python-scraping.com/places/default/view/Anguilla-8
Downloading: http://example.python-scraping.com/places/default/view/Antarctica-9
Downloading: http://example.python-scraping.com/places/default/view/Antigua-and-Barbuda-10
Downloading: http://example.python-scraping.com/places/default/view/Argentina-11
...
'''
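The "# scrape html here" placeholder above is where per-page extraction would go. As a hedged illustration (the <td class="w2p_fw"> markup is an assumption about the demo site's country pages, not shown in this post), a regex could pull the field values:

# inside the loop of crawl_sitemap, after html = download(link)
fields = re.findall('<td class="w2p_fw">(.*?)</td>', html)
print(fields)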
import itertools
import urllib.request
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url, num_retries=2, user_agent='wswp', charset='utf-8'):
    print('Downloading:', url)
    request = urllib.request.Request(url)
    request.add_header('User-agent', user_agent)
    try:
        resp = urllib.request.urlopen(request)
        cs = resp.headers.get_content_charset()
        if not cs:
            cs = charset
        html = resp.read().decode(cs)
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error:', e.reason)
        html = None
        if num_retries > 0:
            if hasattr(e, 'code') and 500 <= e.code < 600:
                # recursively retry 5xx HTTP errors
                return download(url, num_retries - 1)
    return html

def crawl_site(url, max_errors=5):
    # keep requesting pages with incrementing numeric IDs until
    # max_errors consecutive downloads fail
    num_errors = 0
    for page in itertools.count(1):
        pg_url = '{}{}'.format(url, page)
        html = download(pg_url)
        if html is None:
            num_errors += 1
            if num_errors == max_errors:
                # reached the max number of consecutive errors, so exit
                break
        else:
            num_errors = 0
            # success - can scrape the result
test_url2 = 'http://example.python-scraping.com/view/-'
# the ID-based crawl is handled by crawl_site; if this path 404s, it still needs adjusting
crawl_site(test_url2)
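If the shorter /view/- path keeps failing, a plausible alternative (an assumption based on the sitemap URLs printed above, not something the original post confirms) is to iterate over the full path instead:

crawl_site('http://example.python-scraping.com/places/default/view/-')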
Original post: https://www.cnblogs.com/Mario-mj/p/11756363.html