An IP proxy can be used to keep your own IP from being blocked by the server when it detects frequent access. To use one, add a proxies parameter to the requests call:
page_text = requests.get(url=url, headers=headers, proxies={'http': '60.167.132.19:9999'} )
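For example, here is a minimal sketch (the proxy address is only a placeholder and may no longer be alive) that supplies both an http and an https entry so the proxy is applied regardless of the target scheme, and adds a timeout so a dead proxy does not hang the request:

import requests

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0'}
# Placeholder proxy address; replace it with a live one from the pool built below.
proxy = {
    'http': 'http://60.167.132.19:9999',
    'https': 'http://60.167.132.19:9999',
}
try:
    page_text = requests.get(url=url, headers=headers, proxies=proxy, timeout=5).text
except requests.RequestException as ex:
    print('Request through the proxy failed:', ex)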
Free proxy IPs can be obtained from the following websites:
1. Kuaidaili: https://www.kuaidaili.com/free/
2. Xicidaili: https://www.xicidaili.com/nn/
There are others on the Internet.
Below is a script that scrapes the free proxies from Kuaidaili and stores the IP addresses in a data folder under the same path as the running script.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/5 10:42
# @Author : ystraw
# @Site :
# @File : getIpPool.py
# @Software: PyCharm Community Edition
# @function: get available proxy IPs from Kuaidaili
# https://www.kuaidaili.com/free/inha/2/
import os
import time
import random
import requests
from bs4 import BeautifulSoup

# Write a string to a file:
def writeFile(filename, content):
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print(filename, 'written!')

# Read a file into a string:
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read in!')
    return content

# Scrape the IP pool and save it to a file
def download_IP():
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    # Collected data, one comma-separated row per proxy:
    IPpool = ''
    # Range of pages retrieved:
    for i in range(1, 5):
        try:
            time.sleep(random.randint(1, 2))
            url = 'https://www.kuaidaili.com/free/inha/' + str(i) + '/'
            print('Request address:', url)
            response = requests.get(url, headers=headers, timeout=10).text
            # Extract information:
            bs = BeautifulSoup(response, 'lxml')
            # Get the rows of the IP table:
            tbody = bs.findAll('tbody')[0]
            trList = tbody.findAll('tr')
            for tr in trList:
                tdList = tr.findAll('td')
                for td in tdList:
                    IPpool += (td.string or '') + ','
                IPpool += '\n'
        except Exception as ex:
            print('Failed to fetch IPs from this page:', ex)
    if IPpool.count('\n') >= 50:
        os.makedirs('./data', exist_ok=True)
        writeFile('./data/IPpool.txt', IPpool)
    else:
        print('Fewer than 50 proxies collected, not written!')

# Read and parse the saved IP file
def getIP():
    ipstring = readFile('./data/IPpool.txt')
    ipList = ipstring.split('\n')
    proxies = []
    # Each row looks like: ip, port, anonymity, type, location, speed, last checked
    for ip in ipList:
        if not ip:
            continue
        ip = ip.split(',')
        try:
            # (protocol, 'ip:port'); the protocol is lower-cased because requests
            # matches the proxy dict keys against the lower-case URL scheme
            proxies.append((ip[3].lower(), ip[0] + ':' + ip[1]))
        except Exception as ex:
            print('IP pool build failed!', ex)
    return proxies

# Return the IP pool
def getproxies():
    # To refresh the pool from the web page, call download_IP() first
    # download_IP()
    # Read the saved IPs
    proxies = getIP()
    # e.g. [('http', '60.167.132.19:9999'), ('http', '60.167.132.19:9999')]
    return proxies

if __name__ == '__main__':
    # Scrape IPs from the web page
    download_IP()
    # Read the saved IPs
    proxies = getIP()
    # Test each proxy against a simple page
    url = 'https://www.baidu.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'}
    for i in range(len(proxies)):
        ip = proxies[i]
        try:
            page_text = requests.get(url=url, headers=headers, proxies={ip[0]: ip[1]}, timeout=5)
            print(i, page_text)
        except Exception as ex:
            print(i, 'proxy failed:', ex)
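If another crawler wants to reuse this pool, one possible pattern is sketched below; it assumes getIpPool.py is on the import path and that ./data/IPpool.txt has already been generated by download_IP(). It shuffles the pool and falls back to the next proxy whenever one fails:

import random
import requests
from getIpPool import getproxies

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0'}

proxies = getproxies()   # e.g. [('http', '60.167.132.19:9999'), ...]
random.shuffle(proxies)

page_text = None
for protocol, address in proxies:
    try:
        # Try the current proxy; move on to the next one if it times out or errors.
        page_text = requests.get(url, headers=headers,
                                 proxies={protocol: address}, timeout=5).text
        break
    except requests.RequestException as ex:
        print('Proxy', address, 'failed:', ex)

if page_text is None:
    print('No proxy in the pool worked for this request.')

Free proxies die quickly, so expect many failures; the point of keeping a pool is that only one of them has to work for each request.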