56 Use of a crawler IP proxy

An IP proxy can be used to keep your IP from being blocked by the server when you send requests too frequently.

To use one, pass a proxies parameter to the requests call:

page_text = requests.get(url=url, headers=headers, proxies={'http': '60.167.132.19:9999'})
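A slightly fuller sketch of the same call (the proxy address is only a placeholder and may no longer work): the keys of the proxies dict are URL schemes, so an https target needs an 'https' entry as well, and a timeout plus exception handling keeps a dead proxy from hanging the crawler.

import requests

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0'}
# Placeholder proxy address; replace it with a live one from a proxy list.
proxy = {'http': 'http://60.167.132.19:9999', 'https': 'http://60.167.132.19:9999'}
try:
    page_text = requests.get(url, headers=headers, proxies=proxy, timeout=5)
    print(page_text.status_code)
except requests.RequestException as ex:
    print('Proxy request failed:', ex)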

Free proxy IPs can be obtained from websites such as:

1. Kuaidaili (fast proxy): https://www.kuaidaili.com/free/

2. Xici proxy: https://www.xicidaili.com/nn/

There are many others online.

Below is a script that scrapes free proxies from Kuaidaili and stores the IP addresses in a data folder next to the running file.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2020/2/5 10:42
# @Author  : ystraw
# @Site    : 
# @File    : getIpPool.py
# @Software: PyCharm Community Edition
# @function: get available proxy IP from fast proxy
# https://www.kuaidaili.com/free/inha/2/

import os
import requests
import time
import random
from bs4 import BeautifulSoup

# Write text to a file (creating the target directory if needed):
def writeFile(filename, content):
    os.makedirs(os.path.dirname(filename) or '.', exist_ok=True)
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(content)
    print(filename, 'written!')

# Read text from a file:
def readFile(filename):
    with open(filename, 'r', encoding='utf-8') as f:
        content = f.read()
    print(filename, 'read!')
    return content

# Get the ip pool and save it to a file
def download_IP():
    ua_list = [
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.77 Safari/537.36",
    ]
    agent = random.choice(ua_list)
    headers = {
        "Connection": "keep-alive",
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Referer": "https://piaofang.maoyan.com/dashboard?movieId=1211270&date=2011-01-02",
        "User-Agent": agent,
    }
    # Proxy for fetching the list itself (currently unused). Note that a dict
    # cannot hold duplicate keys and requests expects lowercase scheme keys:
    proxies = {'http': '223.199.27.122:9999'}
    # sess = requests.session()
    # Collection data:
    IPpool = ''
    # Range of pages retrieved:
    for i in range(1, 5):
        try:
            time.sleep(random.randint(1, 2))
            url = 'https://www.kuaidaili.com/free/inha/'+str(i)+'/'
            print('Request address:', url)
            response = requests.get(url, headers=headers).text
            # print(response)
            # Extract information:
            bs = BeautifulSoup(response, 'lxml')
            # Get ip list:
            tbody = bs.findAll('tbody')[0]
            trList = tbody.findAll('tr')
            for tr in trList:
                # print(tr)
                tdList = tr.findAll('td')
                for td in tdList:
                    # print(td.string, end=' ')
                    IPpool += td.string + ','
                IPpool += '\n'
        except Exception as ex:
            print('Failed to fetch IPs from this page:', ex)
        # print(IPpool)
    # Only write the file if a reasonable number of rows was collected:
    if len(IPpool) > 3328:
        writeFile('./data/IPpool.txt', IPpool)
    else:
        print('Collected fewer than about 50 proxies, not written!')

# Read the saved IP file and build the proxy list
def getIP():
    ipstring = readFile('./data/IPpool.txt')
    ipList = ipstring.split('\n')
    proxies = []  # 'http': '223.199.27.122:9999'
    for ip in ipList:
        if not ip:
            continue
        ip = ip.split(',')
        try:
            # ip[3] is the protocol column (e.g. 'HTTP'); requests expects
            # lowercase scheme keys, so normalize it:
            proxies.append((ip[3].lower(), ip[0] + ':' + ip[1]))
        except Exception as ex:
            print('Failed to parse proxy entry:', ex)
    return proxies

# Return the IP pool
def getproxies():
    # Get ip from web page
    # download_IP()
    # Read saved ip
    proxies = getIP()
    # [('http', '60.167.132.19:9999'), ('http', '60.167.132.19:9999')]
    return proxies

if __name__ == '__main__':
    # Get ip from web page
    download_IP()
    # Read saved ip
    proxies = getIP()
    # test
    url = 'https://www.baidu.com'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'}
    for i in range(len(proxies)):
        ip = proxies[i]
        # Free proxies are often dead, so use a timeout and catch failures:
        try:
            page_text = requests.get(url=url, headers=headers, proxies={ip[0]: ip[1]}, timeout=5)
            print(i, page_text)
        except Exception as ex:
            print(i, 'proxy failed:', ex)
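Since most free proxies die quickly, it can also help to filter the pool down to the proxies that actually respond before handing them to a crawler. The helper below is only a sketch, not part of the original script; the test URL and timeout are arbitrary choices.

import requests

# Keep only the proxies that answer a quick test request:
def filterWorking(proxies, test_url='https://www.baidu.com', timeout=5):
    working = []
    for scheme, addr in proxies:
        try:
            r = requests.get(test_url, proxies={scheme: addr}, timeout=timeout)
            if r.status_code == 200:
                working.append((scheme, addr))
        except Exception:
            pass  # dead or too slow, skip it
    return working

For example, filterWorking(getproxies()) returns only the usable entries from the saved pool.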

 
