Collected sites:
Free proxy IP http://ip.yqe.com/ipproxy.htm
66 free agent network http://www.66ip.cn/
89 free agent http://www.89ip.cn/
Worry free agent http://www.data5u.com/
Cloud agent http://www.ip3366.net/
Fast agent https://www.kuaidaili.com/free/
Fast exclusive agent http://www.superfastip.com/
HTTP proxy IP https://www.xicidaili.com/wt/
Xiaoshu agent http://www.xsdaili.com
Xiladaili free agent IP http://www.xiladaili.com/
Xiaohuan HTTP proxy https://ip.ihuan.me/
Internet proxy IP http://www.goubanjia.com/
Feilong agent IP http://www.feilongip.com/
Collection process
Step 1: get page content
Step 2: analyze the content to get the data
Step 3: data format conversion
Once the collection workflow is fixed, we implement it as an abstract base class that every site collector inherits; each subclass only needs to implement the abstract methods. This is the classic template-method pattern.
Base class
from abc import ABC, abstractmethod from typing import List import requests import bs4 from .model import ProxyModel class AbsFreeProxyBase(ABC): # request http = requests # Initialization def __init__(self, url, code, **kwargs): """ :param url: Request address :param code: Page coding :param kw: Additional information """ self.url = url self.code = code self.kwargs = kwargs self.beautifulsoup = bs4.BeautifulSoup # Template method pattern # Step 1: get page content step 2: analyze content step 2: format data def run(self) -> List[ProxyModel]: text = self.get_page_text() soup = self.beautifulsoup(text, 'lxml') data = self.parse_text(soup) return self.to_proxy(data) # Get page content def get_page_text(self): res = AbsFreeProxyBase.http.get(self.url, **self.kwargs) if not res.ok: res.raise_for_status() return res.content.decode(self.code) # Parsing content @abstractmethod def parse_text(self, soup: bs4.BeautifulSoup) -> List[list]: pass # format conversion @abstractmethod def to_proxy(self, data:List[list]) -> List[ProxyModel]: pass
For example, the Kuaidaili ("fast proxy") site:
from .base import AbsFreeProxyBase from typing import List from .model import ProxyModel import re ''' //Quick acting https://www.kuaidaili.com/free ''' class WWW_KUAIDAILI_COM(AbsFreeProxyBase): # Initialization def __init__(self, url, code='utf-8', **kwargs): super().__init__(url, code, **kwargs) # Parsing content def parse_text(self, soup) -> List[list]: """ //The format is as follows: IP port(port) Anonymity type(HTTP/https) position response speed Last validation time """ regex = re.compile(r'<td[^>]*>([^<>]+)</td>') rows = soup.select('.table-bordered tr') result = [] for row in [str(n) for n in rows]: item = regex.findall(row) item and result.append(item) return result # format conversion def to_proxy(self, data: List[list]) -> List[ProxyModel]: result = [] for item in data: result.append(ProxyModel(item[3], item[0], item[1], item[2])) return result
Example application
from website import ProxyFactory from browser.agent import useragent factory = ProxyFactory() headers = { 'user-agent': useragent.random() } ''' 66 Free agent network www = factory.create('http://www.66ip.cn/mo.php?sxb=&tqsl=100&port=&export=&ktip=&sxa=&submit=%CC%E1++%C8%A1&textarea=', 'gbk', headers=headers) ''' ''' //Xiaohuan HTTP proxy www = factory.create('https://ip.ihuan.me/',headers = headers) ''' ''' 89 Free agent http://www.89ip.cn/ www = factory.create('http://www.89ip.cn/',headers = headers) ''' ''' //Worry free agent http://www.data5u.com/ www = factory.create('http://www.data5u.com/',headers = headers) ''' ''' http://www.goubanjia.com/ //Whole network agent IP www = factory.create('http://www.goubanjia.com/',headers = headers) ''' ''' //Cloud agent http://www.ip3366.net/ www = factory.create('http://www.ip3366.net/','gbk',headers = headers) ''' ''' //Quick acting https://www.kuaidaili.com/free ''' www = factory.create('https://www.kuaidaili.com/free',headers = headers) data = www.run() print(data)
Baidu Netdisk download:
Link: https://pan.baidu.com/s/1aNiuGpcDmgvUR2HmZKR99Q
Extraction code: 6u82