selenium
basic operation
from time import sleep

from selenium import webdriver
from selenium.webdriver.common.by import By

# Instantiate a browser object driven by the local chromedriver binary.
# NOTE(review): `executable_path` follows the Selenium 3 style used in these
# notes; newer Selenium releases are configured through a Service object
# instead -- confirm against the installed version.
bro = webdriver.Chrome(executable_path=r'C:\pycahrm file\chromedriver.exe')
try:
    # Let the browser issue the request, as a user would.
    url = 'https://www.jd.com/'
    bro.get(url)

    # Locate the search box and type the query into it.
    search_input = bro.find_element(By.ID, 'key')
    search_input.send_keys('macPro')

    # Locate the search button and click it.
    btn = bro.find_element(By.XPATH, '//*[@id="search"]/div/div[2]/button')
    btn.click()
    sleep(2)

    # Execute JavaScript inside the page: scroll to the bottom of the document.
    jsCode = 'window.scrollTo(0,document.body.scrollHeight)'
    bro.execute_script(jsCode)
    sleep(3)
finally:
    # Always shut the browser down, even when one of the steps above raises;
    # otherwise the browser/driver processes are left running.
    bro.quit()
selenium
-Concept: a module based on browser automation.
-Installation of environment:
- pip install selenium
-How selenium relates to web crawlers:
-Simulate login
-Easy to capture dynamically loaded data (key points)
-Getting the page source data: read the browser object's page_source attribute
-Features: what is visible in the browser is available to the crawler
-Disadvantages: low efficiency
-Specific use of selenium
-Prepare the browser driver: http://chromedriver.storage.googleapis.com/index.html
-Action chain: ActionChains, a series of actions
-Use process:
-Instantiate an action chain (ActionChains) object, binding it to the browser object it will operate on
-Perform related continuous actions
-perform() executes the action immediately
Sliding operation
Method 1:
import time

from selenium import webdriver
from selenium.webdriver import ActionChains  # Action chain
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path=r'C:\pycahrm file\chromedriver.exe')
try:
    bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    bro.implicitly_wait(3)

    # The demo widgets live inside an iframe; without switching into it the
    # elements below cannot be found.
    bro.switch_to.frame('iframeResult')

    begin_tag = bro.find_element(By.ID, 'draggable')  # Start position of the slider
    end_tag = bro.find_element(By.ID, 'droppable')  # Stop position of the slider

    actions = ActionChains(bro)  # Action chain bound to this browser
    actions.drag_and_drop(begin_tag, end_tag)  # Queue the action for serial execution
    actions.perform()  # Run the queued actions
    time.sleep(2)
finally:
    # Quit even if a lookup or action fails so no browser process is leaked.
    bro.quit()
Method 2:
import time

from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

bro = webdriver.Chrome(executable_path=r'C:\pycahrm file\chromedriver.exe')
try:
    bro.get('http://www.runoob.com/try/try.php?filename=jqueryui-api-droppable')
    bro.implicitly_wait(3)

    # The draggable/droppable elements are inside this iframe.
    bro.switch_to.frame('iframeResult')
    begin_tag = bro.find_element(By.ID, 'draggable')
    end_tag = bro.find_element(By.ID, 'droppable')

    # Press and hold the slider at its start position.
    ActionChains(bro).click_and_hold(begin_tag).perform()

    # Horizontal distance between the two elements.
    distance = end_tag.location['x'] - begin_tag.location['x']

    # Move in 50-pixel steps until the distance is covered. The counter must
    # be advanced inside the loop, otherwise the loop never terminates.
    track = 0
    while track < distance:
        ActionChains(bro).move_by_offset(xoffset=50, yoffset=0).perform()
        track += 50

    # Sliding finished: release the slider.
    ActionChains(bro).release().perform()
    time.sleep(2)
finally:
    # Quit even if a step fails so no browser process is leaked.
    bro.quit()
Simulated login to 12306 based on selenium
import requests
from hashlib import md5
class Chaojiying_Client(object):
    """Small HTTP client for the Chaojiying captcha-recognition endpoints.

    The account name, the MD5 hex digest of the password and the software id
    are sent as form fields with every request.
    """

    def __init__(self, username, password, soft_id, timeout=60):
        """Store the credentials and the headers shared by all requests.

        username: account name, sent as ``user``.
        password: plain-text password; only its MD5 hex digest is kept and
            sent (as ``pass2``).
        soft_id: software id, sent as ``softid``.
        timeout: seconds handed to ``requests`` as the timeout of each call,
            so a stalled connection cannot block forever.
        """
        self.username = username
        self.password = md5(password.encode('utf8')).hexdigest()
        self.soft_id = soft_id
        self.timeout = timeout
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """Upload an image for recognition and return the decoded JSON reply.

        im: image content as bytes.
        codetype: captcha type code, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post(
            'http://upload.chaojiying.net/Upload/Processing.php',
            data=params,
            files=files,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()

    def ReportError(self, im_id):
        """Report a wrongly recognised image and return the decoded JSON reply.

        im_id: id of the image whose recognition result was wrong.
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post(
            'http://upload.chaojiying.net/Upload/ReportError.php',
            data=params,
            headers=self.headers,
            timeout=self.timeout,
        )
        return r.json()
from selenium import webdriver
from selenium.webdriver import ActionChains
from time import sleep
from PIL import Image #Install PIL or pilot
from CJY import Chaojiying_Client
# Helper that wraps the captcha-recognition call.
def transformCode(imgPath, imgType):
    """Send the image stored at *imgPath* for recognition.

    imgPath: path of the captcha image file; it is read in binary mode.
    imgType: captcha type code forwarded to ``Chaojiying_Client.PostPic``.

    Returns the ``pic_str`` field of the service's JSON reply.
    """
    chaojiying = Chaojiying_Client('username', 'password', '902590')
    # Use a context manager so the file handle is closed right after reading.
    with open(imgPath, 'rb') as img_file:
        im = img_file.read()
    return chaojiying.PostPic(im, imgType)['pic_str']
# Imported here so this snippet can use By locators without changing the
# import lines above it.
from selenium.webdriver.common.by import By


def _parse_click_points(result):
    """Turn 'x1,y1|x2,y2|x3,y3' into [[x1, y1], [x2, y2], [x3, y3]].

    A result holding a single 'x,y' pair (no '|') yields a one-element list,
    because splitting on '|' then returns just that pair.
    """
    points = []
    for pair in result.split('|'):
        coords = pair.split(',')
        points.append([int(coords[0]), int(coords[1])])
    return points


bro = webdriver.Chrome(executable_path=r'C:\pycahrm file\chromedriver.exe')
try:
    bro.get('https://kyfw.12306.cn/otn/login/init')
    sleep(2)

    # Save the current browser page as a picture.
    bro.save_screenshot('./main.png')

    # Find the captcha image so its position in the page can be used to crop
    # the screenshot down to just the captcha.
    img_tag = bro.find_element(By.XPATH, '//*[@id="loginForm"]/div/ul[2]/li[4]/div/div/div[3]/img')
    location = img_tag.location  # Coordinates of the element's top-left corner
    size = img_tag.size  # Width and height of the element

    # Crop rectangle: (left, upper, right, lower).
    rangle = (
        int(location['x']),
        int(location['y']),
        int(location['x'] + size['width']),
        int(location['y'] + size['height']),
    )

    # Crop the captcha area out of the screenshot and save it; the context
    # manager closes the screenshot file afterwards.
    with Image.open('./main.png') as screenshot:
        frame = screenshot.crop(rangle)
        frame.save('code.png')

    # Call the recognition platform to identify the captcha.
    result = transformCode('./code.png', 9004)
    print(result)  # x1,y1|x2,y2|x3,y3

    # x1,y1|x2,y2|x3,y3 ==> [[x1,y1],[x2,y2],[x3,y3]]
    all_list = _parse_click_points(result)

    # Click every returned point, using offsets relative to the captcha image.
    # NOTE(review): newer Selenium releases appear to measure this offset from
    # the element's centre rather than its top-left corner -- confirm against
    # the installed version.
    for x, y in all_list:
        ActionChains(bro).move_to_element_with_offset(img_tag, x, y).click().perform()
        sleep(1)

    bro.find_element(By.ID, 'username').send_keys('xxxxxx')
    sleep(1)
    bro.find_element(By.ID, 'password').send_keys('xxxx')
    sleep(1)
    bro.find_element(By.ID, 'loginSub').click()
    sleep(10)
    print(bro.page_source)
finally:
    # Quit even if recognition or a lookup fails so no browser process leaks.
    bro.quit()
Avoiding detection when using selenium
Evading detection
from selenium.webdriver import ChromeOptions
from selenium import webdriver

# Background, as described in these notes: when a page is opened by an
# automated crawler, evaluating window.navigator.webdriver in the page returns
# true, whereas for a normal visit it returns undefined. Sites can use this to
# detect automation. Dropping the 'enable-automation' switch is the
# counter-measure demonstrated here.
evasion_opts = ChromeOptions()
evasion_opts.add_experimental_option('excludeSwitches', ['enable-automation'])

bro = webdriver.Chrome(options=evasion_opts, executable_path='./chromedriver.exe')

url = 'https://www.taobao.com/'
bro.get(url)
Headless browser
# Headless browser: Chrome runs without any visible window.
from time import sleep

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# Pass the options through the `options` keyword, matching the evasion
# example in these notes, instead of the older `chrome_options` keyword.
bro = webdriver.Chrome(executable_path='./chromedriver.exe', options=chrome_options)
try:
    url = 'https://www.taobao.com/'
    bro.get(url)
    sleep(2)
    bro.save_screenshot('123.png')
    print(bro.page_source)
finally:
    # A headless browser has no window to close by hand, so quit explicitly
    # to avoid leaving the process running.
    bro.quit()