Crawling Enterprise Data for Anhui Province from the Qichacha Enterprise Search Website

To make it easier to look up company information, I crawled the enterprise data for Anhui Province from Qichacha. The problems I ran into and the techniques I used are described below:

1. Problems encountered:

1 > The Qichacha PC site only exposes the first 500 result pages. To crawl as much of the site's data as possible, the crawl is split up by city; across the 16 cities of Anhui Province it collected roughly 80,000 enterprise records (16 cities × 500 pages × about 10 listings per page).

2 > If pages are requested too quickly, the site triggers manual (captcha) verification. To get past that and avoid being blocked, each request goes out through a randomly chosen proxy IP. Free proxies can be obtained from the "89 Free Proxy" site at http://www.89ip.cn/ , which returns about 30 proxy IPs per visit; if that is not enough, the list can be fetched several times to build up a proxy pool. In my tests, the free proxies from this site worked noticeably better than the free ones from the Xici Proxy and Kuaidaili sites.
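
As a rough sketch of how such a pool can be built, the snippet below grabs one batch of proxies from the 89ip.cn list page and repeats the fetch a few times. The table layout assumed by the regular expression, and the fetch_free_proxies helper itself, are my own assumptions for illustration rather than anything provided by the site:

import re
import requests


def fetch_free_proxies():
    """Pull one batch of free proxies from 89ip.cn as requests-style proxy dicts."""
    html = requests.get('http://www.89ip.cn/', timeout=5).text
    # Match "ip</td> <td>port" pairs; adjust the pattern if the page markup changes.
    pairs = re.findall(r'(\d{1,3}(?:\.\d{1,3}){3})\s*</td>\s*<td>\s*(\d{2,5})', html, re.S)
    return [{'http': 'http://{}:{}'.format(ip, port),
             'https': 'http://{}:{}'.format(ip, port)} for ip, port in pairs]


# Extract a few batches to build the pool (roughly 30 proxies per fetch).
IP_LIST = []
for _ in range(3):
    IP_LIST.extend(fetch_free_proxies())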


2. Technology used:

1 > Requests: the requests module sends the HTTP requests; to dodge anti-crawling measures, each request uses a random proxy and a random User-Agent generated with fake_useragent (a minimal sketch of this request pattern follows this list);

2 > Parsing: lxml XPath plus regular expressions;

3 > Speed optimization: the crawl runs in multiple threads, and scraped data is saved in one pass rather than piecemeal, to avoid frequent disk I/O.
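
To make items 1 > and 2 > concrete before the full program, here is a minimal sketch of the request pattern: a random User-Agent from fake_useragent plus a randomly chosen proxy, with a couple of retries. The fetch helper and its retry count are just illustrative, not part of the crawler below:

import random
import requests
from fake_useragent import UserAgent


def fetch(url, proxy_pool, retries=3):
    """GET a page with a random User-Agent and a random proxy, retrying on failure."""
    for _ in range(retries):
        headers = {'User-Agent': UserAgent().random}
        proxy = random.choice(proxy_pool)
        try:
            resp = requests.get(url, headers=headers, proxies=proxy, timeout=3)
            return resp.content.decode('utf-8', 'ignore')
        except requests.RequestException:
            continue  # dead proxy or timeout: try again with a different proxy
    return None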

3. The core code is as follows:

  

import requests
from lxml import etree
from queue import Queue
from threading import Thread
from fake_useragent import UserAgent
import csv
import os
import re
import random
import time
from ippools import ProxySpider  # local helper module that scrapes a live proxy pool
from proxy_ip import IP_LIST     # local module holding a pre-fetched list of proxy dicts


class QichachaSpider:
    def __init__(self):
        self.url = 'https://www.qichacha.com/gongsi_area.html?prov={}&city={}&p={}'
        self.q = Queue()
        self.company_info = []
        self.headers = {
            'Host': 'www.qichacha.com',
            'Referer': 'https://www.qichacha.com/',
            'X-Requested-With': 'XMLHttpRequest'
        }

    # random User-Agent
    def random_ua(self):
        ua = UserAgent()
        return ua.random

    # Random IP
    def random_proxy(self):
        proxy_list = ProxySpider().get_training_ip('https://www.qichacha.com/')
        return proxy_list

    # Build the crawl target URLs and put them into the queue
    def put_url(self):
        self.headers['User-Agent'] = self.random_ua()
        url = 'https://www.qichacha.com/'
        html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
        parse_html = etree.HTML(html)
        r_list = parse_html.xpath('//div[@class="areacom"]/div[2]/div[2]/a/@href')
        for r in r_list:
            link = r.split('_')[1:]
            for i in range(1, 501):
                url = self.url.format(link[0], link[1], i)
                print(url)
                self.q.put(url)

    # Getting first-level page data
    def get_data(self):
        while True:
            if not self.q.empty():
                url = self.q.get()
                self.headers['User-Agent'] = self.random_ua()
                # proxies = self.random_proxy()
                proxies = random.choice(IP_LIST)
                try:
                    html = requests.get(url, headers=self.headers, proxies=proxies,
                                        timeout=3).content.decode('utf-8', 'ignore')
                    # html = requests.get(url, headers=self.headers).content.decode('utf-8', 'ignore')
                    # time.sleep(random.uniform(0.5, 1.5))
                    parse_html = etree.HTML(html)
                    company_list = parse_html.xpath('//table[@class="m_srchList"]/tbody/tr')
                    if company_list:  # xpath returns a list, so just check that it is non-empty
                        for company in company_list:
                            try:
                                company_name = company.xpath('./td[2]/a/text()')[0].strip()
                                company_link = 'https://www.qichacha.com' + company.xpath('./td[2]/a/@href')[0].strip()
                                company_type, company_industry, company_business_scope = self.get_company_info(
                                    company_link)
                                company_person = company.xpath('./td[2]/p[1]/a/text()')[0].strip()
                                company_money = company.xpath('./td[2]/p[1]/span[1]/text()')[0].split(': ')[-1].strip()
                                company_time = company.xpath('./td[2]/p[1]/span[2]/text()')[0].split(': ')[-1].strip()
                                company_email = company.xpath('./td[2]/p[2]/text()')[0].split(': ')[-1].strip()
                                company_phone = company.xpath('td[2]/p[2]/span/text()')[0].split(': ')[-1].strip()
                                company_address = company.xpath('td[2]/p[3]/text()')[0].split(': ')[-1].strip()
                                company_status = company.xpath('td[3]/span/text()')[0].strip()
                                company_dict = {
                                    'Company name': company_name,
                                    'Company link': company_link,
                                    'Company type': company_type,
                                    'Industry': company_industry,
                                    'Business scope': company_business_scope,
                                    'Legal representative': company_person,
                                    'Registered capital': company_money,
                                    'Registration date': company_time,
                                    'Email': company_email,
                                    'Phone': company_phone,
                                    'Address': company_address,
                                    'Operating status': company_status,
                                }
                                print(company_dict)
                                # self.company_info.append(
                                #     (company_name, company_link, company_type, company_industry, company_business_scope,
                                #      company_person, company_money, company_time, company_email, company_phone,
                                #      company_address, company_status))
                                info_list = [company_name, company_link, company_type, company_industry,
                                             company_business_scope, company_person, company_money, company_time,
                                             company_email, company_phone, company_address, company_status]
                                self.save_data(info_list)

                            except Exception:
                                # log the listing page that failed so it can be inspected or retried later
                                with open('./bad.csv', 'a', encoding='utf-8', newline='') as f:
                                    writer = csv.writer(f)
                                    writer.writerow([url])
                                continue
                except Exception:
                    # request failed (dead proxy, timeout, ...): requeue the URL for another attempt
                    self.q.put(url)

            else:
                break

    # Getting secondary page data
    def get_company_info(self, company_link):
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST), timeout=3).content.decode(
            'utf-8', 'ignore')
        # Keep re-requesting until the detail page actually contains the field labels.
        # Note: the live page shows these labels in Chinese, so the literals here and in
        # the regular expressions below must match whatever text the page really uses.
        while True:
            if 'Enterprise type' not in html:
                html = requests.get(company_link, headers=headers, proxies=random.choice(IP_LIST),
                                    timeout=3).content.decode('utf-8', 'ignore')
            else:
                break
        try:
            company_type = re.findall(r'Enterprise type</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_industry = re.findall(r'Industry</td> <td class="">(.*?)</td>', html, re.S)[0].strip()
            company_business_scope = re.findall(r'Scope of Business.*?"3">(.*?)</td>', html, re.S)[0].strip()
            return company_type, company_industry, company_business_scope
        except:
            return 'nothing', 'nothing', 'nothing'

    # Save data
    def save_data(self, info):
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(info)

    def main(self):
        # Start from a clean CSV and always write the header row first
        if os.path.exists('./1111.csv'):
            os.remove('./1111.csv')
        with open('./1111.csv', 'a', encoding='utf-8', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(
                ['Company name', 'Company link', 'Company type', 'Industry', 'Business scope',
                 'Legal representative', 'Registered capital', 'Registration date', 'Email',
                 'Phone', 'Address', 'Operating status'])
        self.put_url()
        t_list = []
        for i in range(0, 10):
            t = Thread(target=self.get_data)
            t_list.append(t)
            t.start()

        for j in t_list:
            j.join()


if __name__ == "__main__":
    spider = QichachaSpider()
    spider.main()

