首页 > 其他 > 详细

使用selenium做简单爬虫的实例

时间:2014-04-17 00:53:57      阅读:1057      评论:0      收藏:0      [点我收藏+]

Selenium 是一个 Web 自动化测试的软件包,可以用于自动测试 Web 应用,也可以当作简单的爬虫制作工具。

这是一个简单的demo,用于爬取Google APP Store中的一个类别:

 

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from time import sleep
 
import sqlite3
import sys
 
# connect the sqlite3
 
def Conn_DB(db_name='app_info.db'):
  """Open a connection to the SQLite database file `db_name`.

  Args:
    db_name: path of the database file handed to `sqlite3.connect`
      (defaults to 'app_info.db').

  Returns:
    The open `sqlite3.Connection`.

  Raises:
    sqlite3.Error: if the database cannot be opened.  The error is printed
      first and then re-raised, so the caller sees the real cause instead of
      an unrelated "variable referenced before assignment" failure.
  """
  try:
    return sqlite3.connect(db_name)
  except sqlite3.Error as e:
    print("Conn Error  %s" % (e,))
    raise
 
# get the category of the apps
 
def Get_Category(root_address):
  """Return the category name encoded at the end of `root_address`.

  The category is the text after the last '/' in the address, cut off at the
  first '?' or space (whichever comes first), so any query string is dropped.
  An address ending in '/' yields an empty string.
  """
  last_segment = root_address.rsplit('/', 1)[-1]
  # Treat '?' like a space so one split strips both the query string and
  # anything following a literal space.
  return last_segment.replace('?', ' ').split(' ', 1)[0]
 
# we have to log in in order to get the info for every app
 
def Login_Google(browser, category_root_address, email='', password=''):
  """Open the category page and submit the Google sign-in form.

  Args:
    browser: selenium WebDriver instance used to drive the page.
    category_root_address: URL of the category page to load first.
    email: account e-mail typed into the 'Email' field.  Defaults to an
      empty string, so pass a real address to actually sign in.
    password: account password typed into the 'Passwd' field; RETURN is sent
      afterwards to submit the form.

  Note: 'Login Success' is printed once the form has been submitted; the
  function does not verify that the sign-in was accepted.
  """
  browser.get(category_root_address)

  # Click the sign-in link (element id 'gb_70') to reach the login form.
  # NOTE(review): the find_element_by_* helpers are reportedly removed in
  # recent Selenium 4 releases -- confirm against the installed version.
  login_link = browser.find_element_by_id('gb_70')
  webdriver.ActionChains(browser).move_to_element(login_link).click(login_link).perform()

  # Fill in the account e-mail.
  email_field = browser.find_element_by_name('Email')
  email_field.send_keys(email)

  # Fill in the password and submit the form with RETURN.
  password_field = browser.find_element_by_name('Passwd')
  password_field.send_keys(password)
  password_field.send_keys(Keys.RETURN)

  print('Login Success')
 
 
# load the whole page and then return the number of the apps under the category
 
def Load_All_Apps(browser, scroll_rounds=13, pause=2.5):
  """Scroll the page repeatedly so that all apps get loaded, then count them.

  Each round scrolls to the bottom and back to the middle of the page
  (pausing after each scroll) and clicks the '#show-more-button' element
  unless its inline style is exactly 'display: none;'.

  Args:
    browser: object exposing `execute_script(js)` (a selenium WebDriver).
    scroll_rounds: number of scroll rounds; the default of 13 was found by
      experiment by the original author.
    pause: seconds to sleep after each scroll so new content can load.

  Returns:
    The number of 'button.price' elements on the page after loading.
  """
  for times in range(scroll_rounds):
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    sleep(pause)
    browser.execute_script("window.scrollTo(0, document.body.scrollHeight * 0.5);")
    sleep(pause)
    print(times)

    # click the show more button to load more apps (only while it is visible)
    show_more_button = browser.execute_script("return document.querySelector('#show-more-button')['style']['cssText'];")
    if show_more_button != 'display: none;':
      browser.execute_script("document.querySelector('#show-more-button').click();")
      print('click button')
    print(show_more_button)

  # back to the top of the page
  browser.execute_script("window.scrollTo(0, 0);")

  number = browser.execute_script("return document.querySelectorAll('button.price').length;")
  print(number)

  return number
 
def Click_Install_Button(browser, category_root_address):
  """Open the install dialog of each app and record its location permissions.

  Every second 'button.price' element (indices 1, 3, 5, ...) is clicked.  The
  dialog that opens is scanned for the "precise" and "approximate" location
  permission descriptions; apps that request at least one of them are
  inserted into the `app_info` table of the database returned by `Conn_DB()`.

  Args:
    browser: selenium WebDriver showing the fully loaded category page.
    category_root_address: URL of the category page; the category name
      stored with each row is derived from it via `Get_Category`.

  Returns:
    None.  Progress and a final summary are printed.
  """
  # JavaScript run inside the install dialog: builds a string with one 'p'
  # per precise-location permission and one 'a' per approximate-location
  # permission.  An empty string means no location access is requested.
  get_permissions_code = """var permissions = document.querySelectorAll('.perm-description');
var precise_location = 'precise location (GPS and network-based)';
var approximate_location = 'approximate location (network-based)';
var ways = '';

for (var i = 0; i < permissions.length; i++) {
    if (permissions[i].innerHTML == precise_location) {
        ways += 'p';
    } else if (permissions[i].innerHTML == approximate_location) {
        ways += 'a';
    }
}
return ways;"""

  # get all install button objects
  get_button_list_code = """return document.querySelectorAll('button.price');"""
  button_list = browser.execute_script(get_button_list_code)
  numbers_of_button = len(button_list)

  count = 0
  sleep(3)

  category = Get_Category(category_root_address)

  # Collect the link of every app title on the page.
  get_app_address_code = """var app_address_list = document.querySelectorAll("h2 a");var list = [];
for (var i = 0; i < app_address_list.length; i++) {list.push(app_address_list[i]['href']);} return list;"""
  address_list = browser.execute_script(get_app_address_code)

  number_of_i_want = 0

  # Parameterized statement: app names may contain quotes, which would break
  # (or inject into) a statement assembled with string formatting.
  # 'categroy' is spelled the way the statement has always named the column.
  insert_sql = u"""insert into app_info (categroy, name, link, get_geo_ways) values (?, ?, ?, ?)"""

  conndb = Conn_DB()
  db_cursor = conndb.cursor()
  try:
    # Only every second button is clicked, starting with the one at index 1.
    for button in button_list[1::2]:
      webdriver.ActionChains(browser).move_to_element(button).click(button).perform()
      sleep(3.5)
      count += 1

      # Pre-set so the error report below can always print it.
      perms = None
      try:
        print("Count  %s" % (count,))
        perms = browser.execute_script(get_permissions_code)
        sleep(2)
        appname = browser.execute_script("return document.querySelector('.purchase-header .title').innerHTML;")
        # NOTE(review): assumes the n-th clicked button belongs to the n-th
        # "h2 a" link -- confirm against the page layout.
        address = address_list[count - 1]
        print(u"App id is:  %s Perm is:  %s Address is:  %s" % (appname, perms, address))

        if perms:
          db_cursor.execute(insert_sql, (category, appname, address, perms))
          conndb.commit()
          number_of_i_want += 1

      except Exception as e:
        # Best effort: report the failure and move on to the next app
        # without touching the cancel button.
        print("Error for  %s Number is  %s Pers is %s" % (e, count, perms))
        continue
      # click the cancel button to close the install dialog
      browser.execute_script("document.querySelector('#purchase-cancel-button').click();")
      sleep(1)

    print("compary  %s %s I want : %s" % (count, numbers_of_button, number_of_i_want))
  finally:
    # Release the database handles even if a click or script call fails.
    db_cursor.close()
    conndb.close()
 
if __name__ == '__main__':
  # Script entry point: log in, load the whole category page, record the
  # location permissions of its apps, then write a completion marker file.

  # The category URL is not defined anywhere in this file, so it is taken
  # from the first command-line argument.
  if len(sys.argv) < 2:
    sys.exit("usage: python %s <category_root_address>" % (sys.argv[0],))
  root_address = sys.argv[1]

  driver = webdriver.Chrome()
  try:
    Login_Google(driver, root_address)
    Load_All_Apps(driver)
    Click_Install_Button(driver, root_address)
  finally:
    # Always shut the browser down, even if one of the steps failed.
    driver.quit()

  # Marker file signalling that the crawl ran to completion.
  with open("./res.txt", "wb") as fd:
    fd.write(b"over")

  

使用selenium做简单爬虫的实例,布布扣,bubuko.com

使用selenium做简单爬虫的实例

原文:http://www.cnblogs.com/jaw-crusher/p/3669387.html

(0)
(0)
   
举报
评论 一句话评论(0)
关于我们 - 联系我们 - 留言反馈 - 联系我们:wmxa8@hotmail.com
© 2014 bubuko.com 版权所有
打开技术之扣,分享程序人生!