Collect 15 proxy IP websites to create a free proxy IP pool

Collected sites:

Free proxy IP http://ip.yqe.com/ipproxy.htm
66 free proxy network http://www.66ip.cn/
89 free proxy http://www.89ip.cn/
Data5u worry-free proxy http://www.data5u.com/
Cloud proxy http://www.ip3366.net/
Kuaidaili free proxies https://www.kuaidaili.com/free/
Superfast exclusive proxy http://www.superfastip.com/
Xici HTTP proxy IP https://www.xicidaili.com/wt/
Xiaoshu proxy http://www.xsdaili.com
Xiladaili free proxy IP http://www.xiladaili.com/
Xiaohuan HTTP proxy https://ip.ihuan.me/
Goubanjia whole-network proxy IP http://www.goubanjia.com/
Feilong proxy IP http://www.feilongip.com/

 

Collection process

Step 1: get page content

Step 2: analyze the content to get the data

Step 3: data format conversion

Since the collection process is the same for every site, implement it once in an abstract base class that all site scrapers inherit; each subclass then only needs to implement the abstract methods. This is the classic template method pattern.

 

Base class

from abc import ABC, abstractmethod
from typing import List
import requests
import bs4
from .model import ProxyModel


class AbsFreeProxyBase(ABC):
    """Abstract base for free-proxy site scrapers (template method pattern).

    ``run`` drives the fixed collection pipeline — fetch page, parse it,
    convert to ``ProxyModel`` objects — while subclasses supply the two
    site-specific steps, ``parse_text`` and ``to_proxy``.
    """

    # HTTP client shared by all scrapers; class-level so it can be
    # swapped out (e.g. for a Session or a mock) in one place.
    http = requests

    def __init__(self, url, code, **kwargs):
        """
        :param url: URL of the proxy listing page to scrape
        :param code: page character encoding (e.g. ``'utf-8'``, ``'gbk'``)
        :param kwargs: extra keyword arguments forwarded verbatim to
                       ``requests.get`` (e.g. ``headers=...``)
        """
        self.url = url
        self.code = code
        self.kwargs = kwargs
        self.beautifulsoup = bs4.BeautifulSoup

    def run(self) -> List[ProxyModel]:
        """Template method.

        Step 1: fetch the page content; step 2: parse the content;
        step 3: convert the parsed rows into ``ProxyModel`` objects.
        """
        text = self.get_page_text()
        soup = self.beautifulsoup(text, 'lxml')
        data = self.parse_text(soup)
        return self.to_proxy(data)

    def get_page_text(self):
        """Fetch the page and decode it with the configured encoding.

        :raises requests.HTTPError: if the response status is not 2xx
        """
        res = AbsFreeProxyBase.http.get(self.url, **self.kwargs)
        # raise_for_status() is already a no-op on success,
        # so the previous `if not res.ok` pre-check was redundant.
        res.raise_for_status()
        return res.content.decode(self.code)

    @abstractmethod
    def parse_text(self, soup: bs4.BeautifulSoup) -> List[list]:
        """Extract raw rows (one list of cell strings per proxy) from the page."""

    @abstractmethod
    def to_proxy(self, data: List[list]) -> List[ProxyModel]:
        """Convert raw rows into ``ProxyModel`` instances."""

For example: the Kuaidaili website

from .base import AbsFreeProxyBase
from typing import List
from .model import ProxyModel
import re

'''
//Quick acting
https://www.kuaidaili.com/free
'''


class WWW_KUAIDAILI_COM(AbsFreeProxyBase):
    """Scraper for the Kuaidaili free proxy list (https://www.kuaidaili.com/free)."""

    def __init__(self, url, code='utf-8', **kwargs):
        """Kuaidaili serves UTF-8 pages, so default the encoding accordingly."""
        super().__init__(url, code, **kwargs)

    def parse_text(self, soup) -> List[list]:
        """Extract one list of cell texts per table row.

        Each data row holds, in order:
        IP, port, anonymity, type (HTTP/HTTPS), location, response speed,
        last validation time
        """
        # Grab the text of every <td> cell via regex on the row's HTML.
        # Header rows use <th>, so findall returns [] for them and they
        # are skipped below.
        regex = re.compile(r'<td[^>]*>([^<>]+)</td>')
        rows = soup.select('.table-bordered tr')
        result = []
        for row in rows:
            item = regex.findall(str(row))
            if item:  # explicit guard instead of the `item and ...` idiom
                result.append(item)
        return result

    def to_proxy(self, data: List[list]) -> List[ProxyModel]:
        """Map raw cells to ProxyModel(type, ip, port, anonymity)."""
        return [ProxyModel(item[3], item[0], item[1], item[2]) for item in data]

 

 

 

Example application

from website import ProxyFactory
from browser.agent import useragent

# Build the site factory and a randomized User-Agent header so requests
# look like they come from an ordinary browser.
factory = ProxyFactory()
headers = {
    'user-agent': useragent.random()
}
# NOTE(review): the triple-quoted blocks below are disabled examples for the
# other supported sites — uncomment exactly one `www = factory.create(...)`.
# Sites whose pages are GBK-encoded pass 'gbk' as the second argument.
'''
66 Free agent network
www = factory.create('http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=',
                     'gbk',
                     headers=headers)
'''

'''
//Xiaohuan HTTP proxy 
www = factory.create('https://ip.ihuan.me/',headers = headers)
'''

'''
89 Free agent http://www.89ip.cn/
www = factory.create('http://www.89ip.cn/',headers = headers)
'''

'''
//Worry free agent http://www.data5u.com/
www = factory.create('http://www.data5u.com/',headers = headers)
'''


'''
http://www.goubanjia.com/
//Whole network agent IP 
www = factory.create('http://www.goubanjia.com/',headers = headers)
'''


'''
//Cloud agent http://www.ip3366.net/
www = factory.create('http://www.ip3366.net/','gbk',headers = headers)
'''

'''
//Quick acting
https://www.kuaidaili.com/free
'''
# Active example: the Kuaidaili free proxy list (UTF-8, the default encoding).
www = factory.create('https://www.kuaidaili.com/free',headers = headers)
# Run the template pipeline: fetch -> parse -> convert to a ProxyModel list.
data = www.run()

print(data)

Baidu SkyDrive

Link: https://pan.baidu.com/s/1aNiuGpcDmgvUR2HmZKR99Q
Extraction code: 6u82

Keywords: Python network PHP

Added by webdesco on Sun, 17 Nov 2019 22:05:15 +0200