selenium 是一个Web自动化测试的软件包,可以用于自动测试Web应用,也可以用于当作简单的爬虫制作工具,
这是一个简单的demo,用于爬取Google APP Store中的一个类别:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171 |
# -*- coding: utf-8 -*- from selenium import
webdriver from selenium.webdriver.common.keys import
Keys from selenium.webdriver.support.ui import
WebDriverWait from time import
sleep import
sqlite3 import
sys # connect the sqlite3 def
Conn_DB(db_name =
‘app_info.db‘ ): try : conn =
sqlite3.connect(db_name) except
Exception, e: print
"Conn Error " , e return
conn # get the category of the apps def
Get_Category(root_address): url_list =
root_address.split( ‘/‘ ) return
url_list[ - 1 ].replace( "?" , ‘ ‘ ).split( ‘ ‘ )[ 0 ] # we have to login so that to get the info from every app def
Login_Google(browser, category_root_address): browser.get(category_root_address) # click to login login_link =
browser.find_element_by_id( ‘gb_70‘ ) webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform() # input your email here email =
browser.find_element_by_name( ‘Email‘ ) # you should input your email here email.send_keys(‘‘) # input your password here pwd =
browser.find_element_by_name( ‘Passwd‘ ) # you should input your password for your email here pwd.send_keys(‘‘) pwd.send_keys(Keys.RETURN) print
‘Login Success‘ # load the whole page and then return the number of the apps under the category def
Load_All_Apps(browser): # try to load the whole page to select want I want, the magic number 13 is based on the test for
times in
xrange ( 13 ): browser.execute_script( "window.scrollTo(0, document.body.scrollHeight);" ) sleep( 2.5 ) browser.execute_script( "window.scrollTo(0, document.body.scrollHeight * 0.5);" ) sleep( 2.5 ) print
times # click the show more button to load more apps show_more_button =
browser.execute_script( "return document.querySelector(‘#show-more-button‘)[‘style‘][‘cssText‘];" ) if
show_more_button ! =
‘display: none;‘ : browser.execute_script( "document.querySelector(‘#show-more-button‘).click();" ) print
‘click button‘ print
show_more_button # to the bottom of the page browser.execute_script( "window.scrollTo(0, 0);" ) number =
browser.execute_script( "return document.querySelectorAll(‘button.price‘).length;" ) print
number return
number def
Click_Install_Button(browser, category_root_address): get_permissions_code =
"""var permissions = document.querySelectorAll(‘.perm-description‘); var precise_locaton = ‘precise location (GPS and network-based)‘; var approximate_location = ‘approximate location (network-based)‘; var ways = ‘‘; for (var perm in permissions) { if (permissions[perm].innerHTML == precise_locaton) { ways += ‘p‘; } else if (permissions[perm].innerHTML == approximate_location) { ways += ‘a‘; } } return ways;""" # get all install button objects get_button_list_code =
"""return document.querySelectorAll(‘button.price‘);""" button_list =
browser.execute_script(get_button_list_code) # print dir(button_list[0]) # button_list.reverse() numbers_of_button =
len (button_list) count =
0 # index = 1 sleep( 3 ) #webdriver.ActionChains(browser).move_to_element(button_list[1]).click(button_list[1]).perform() #sleep(1) #browser.execute_script("document.querySelector(‘#purchase-cancel-button‘).click();") #webdriver.ActionChains(browser).move_to_element(button_list[3]).click(button_list[3]).perform() #sleep(1) #browser.execute_script("document.querySelector(‘#purchase-cancel-button‘).click();") category =
Get_Category(category_root_address) get_app_address_code =
"""var app_address_list = document.querySelectorAll("h2 a");var list = []; for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i][‘href‘]);} return list;""" address_list =
browser.execute_script(get_app_address_code) conndb =
Conn_DB() db_cursor =
conndb.cursor() number_of_i_want =
0 insert_sql =
u """insert into app_info (categroy, name, link, get_geo_ways) values (‘{0}‘, ‘{1}‘, ‘{2}‘, ‘{3}‘)""" for
index in
range ( 1 , numbers_of_button, 2 ): try : webdriver.ActionChains(browser).move_to_element(button_list[index]).click(button_list[index]).perform() sleep( 3.5 ) count + =
1 #index += 2 except
IndexError: print
"Out of index" break try : print
"Count " , count perms =
browser.execute_script(get_permissions_code) sleep( 2 ) appname =
browser.execute_script( "return document.querySelector(‘.purchase-header .title‘).innerHTML;" ) print
u "App id is: " , appname , u "Perm is: " , perms, u "Address is: " , address_list[count -
1 ] if
perms: sql_with_data =
insert_sql. format (category, appname, address_list[count -
1 ], perms) db_cursor.execute(sql_with_data) conndb.commit() number_of_i_want + =
1 except
Exception, e: print
"Error for " , e, "Number is " , count, "Pers is" , perms continue # click cancle button browser.execute_script( "document.querySelector(‘#purchase-cancel-button‘).click();" ) sleep( 1 ) print
"compary " , count , numbers_of_button, "I want :" , number_of_i_want db_cursor.close() conndb.close() # print browser.execute_script() if
__name__ = =
‘__main__‘ : driver =
webdriver.Chrome() Login_Google(driver, root_address) Load_All_Apps(driver) Click_Install_Button(driver, root_address) #sys.exit() fd =
file ( "./res.txt" , "wb" ) fd.write( "over" ) fd.close() |
使用selenium做简单爬虫的实例,布布扣,bubuko.com
原文:http://www.cnblogs.com/jaw-crusher/p/3669387.html