# -*- coding: utf-8 -*-
"""Scrape law texts from chinacourt.org and store them article by article.

The listing pages are walked with ``Get_urls``; every law page is then
downloaded, split into (chapter, article, body) records and written through
the project-local ``SQL`` module.
"""
from lxml import etree
import re
import requests  # HTTP client used for the listing pages and the law pages
import 爬取法律法规.SQL as SQL

# Characters removed from every text node before it is classified.
_NOISE_RE = re.compile(r'([\xa0\r\n\t\xae\s\u3000\ue004\ue003\ufeff\ufffd])')
# Line classifiers: chapter (章), article (条) and section (节) headings.
_CHAPTER_RE = re.compile(r"(^第[一二三四五六七八九十百千]+章)")
_ARTICLE_RE = re.compile(r"(^第[一二三四五六七八九十百千]+条)")
_SECTION_RE = re.compile(r"(^第[一二三四五六七八九十百千]+节)")


def Get_urls(start, end):
    """Collect law titles and relative links from the listing pages.

    Args:
        start: First listing page number (inclusive).
        end: Last listing page number (exclusive), as in ``range(start, end)``.

    Returns:
        A tuple ``(hrefs_title, hrefs_link)`` of two lists of equal length;
        ``hrefs_title[k]`` is the anchor text belonging to ``hrefs_link[k]``.

    Raises:
        requests.RequestException: If a listing page cannot be downloaded
            (for example on timeout); the error is not handled here.
    """
    hrefs_link = []
    hrefs_title = []
    for i in range(start, end):
        url = 'https://www.chinacourt.org/law/more/law_type_id/MzAwNEAFAA/page/' + str(i) + '.shtml'
        print(url)
        strhtml = requests.get(url, timeout=7)
        tree = etree.HTML(strhtml.text)
        if tree is None:
            # etree.HTML returns None for an empty document; nothing to read.
            continue
        # Read title and link from the SAME anchor element so the two lists
        # cannot drift apart when an anchor has no text or several text nodes.
        for anchor in tree.xpath('//*[@id="flwk"]/div[1]/div[2]/ul//a[@href]'):
            title = ''.join(anchor.itertext())
            if not title.strip():
                # An anchor without visible text has no title to store under.
                continue
            hrefs_title.append(title)
            hrefs_link.append(anchor.get('href'))
    return hrefs_title, hrefs_link


def _parse_articles(text_nodes):
    """Group the text nodes of a law page into (chapter, article, body) tuples.

    Section headings (节) are skipped. A chapter heading (章) updates the
    chapter used for the articles that start after it. An article heading (条)
    starts a new record, and every other line is appended to the record that is
    currently open. Lines seen before the first article are ignored, and
    records whose body is empty are dropped.
    """
    articles = []
    chapter = ""
    open_chapter = ""  # chapter that was current when the open article began
    open_article = ""
    body_parts = None  # None until the first article heading is seen

    for raw in text_nodes:
        val = _NOISE_RE.sub('', raw)
        if _SECTION_RE.search(val):
            continue
        chapter_match = _CHAPTER_RE.search(val)
        if chapter_match:
            chapter = chapter_match.group(1)
            continue
        article_match = _ARTICLE_RE.search(val)
        if article_match:
            if body_parts is not None:
                articles.append((open_chapter, open_article, ''.join(body_parts)))
            open_chapter = chapter
            open_article = article_match.group(1)
            # Slice after the heading instead of splitting on "条", so later
            # "条" characters inside the body text are preserved.
            body_parts = [val[article_match.end():]]
        elif body_parts is not None:
            body_parts.append(val)

    # Flush the final article; nothing follows it to trigger the flush above.
    if body_parts is not None:
        articles.append((open_chapter, open_article, ''.join(body_parts)))

    return [record for record in articles if record[2]]


def _store_law(title, link):
    """Download one law page and replace its stored articles.

    Existing rows for ``title`` are deleted only after the page has been
    downloaded and at least one article was parsed, so a failed request or an
    unexpected page layout leaves previously stored data untouched.
    """
    href_url = 'https://www.chinacourt.org' + link
    print(href_url)
    print(title)

    response = requests.get(href_url, timeout=(7, 7))  # fetch the page via GET
    response.raise_for_status()
    tree = etree.HTML(response.text)
    text = [] if tree is None else tree.xpath('//div[@class="content_text"]//text()')

    articles = _parse_articles(text)
    if not articles:
        print('no articles parsed, skipping %s' % title)
        return

    values = SQL.select_db(title)
    if len(values) > 0:
        SQL.delete_db(title)
    for name_zhang, name_tiao, name_info in articles:
        SQL.insert(title, name_zhang, name_tiao, name_info)


def main():
    """Scrape listing pages 500-533 and store every law that is still relevant."""
    hrefs_title, hrefs_link = Get_urls(500, 534)

    for num_i, (href_title, href_link) in enumerate(zip(hrefs_title, hrefs_link)):
        print(num_i)
        try:
            # Skip titles containing "失效" or "主席令".
            if ("失效" in href_title) or ("主席令" in href_title):
                continue
            _store_law(href_title, href_link)
        except Exception as r:
            # Best-effort batch job: report the failure and move on to the
            # next law instead of aborting the whole run.
            print('未知错误 %s' % (r))
            continue


if __name__ == "__main__":
    main()
# Original article: https://www.cnblogs.com/smartisn/p/14426584.html