Crawling Zhilian Zhaopin job listings with Python + Selenium

The whole crawler is built on Selenium and Python. The packages it needs are:

mysql-connector-python, numpy, matplotlib, selenium

You also need the Firefox driver for Selenium (geckodriver); a quick web search turns up installation instructions.

The crawler is modular: functions and classes with different responsibilities live in separate files, and the constants that need configuring are collected in Constant.py.
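From the imports alone you can reconstruct the project layout:

Main.py - entry point and overall flow
zhilian.py - the Selenium crawling logic
sqlDeal.py - MySQL storage and retrieval
dataFactory.py - parsing of salary strings such as "10K-20K"
plotpy.py - the plotting functions
Constant.py - configuration constants (JOB_KEY, PAGE_NUMBER, database settings)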

Project address: GitHub.

The main entry point of the crawler is Main.py. Once Constant.py is configured, you can run Main.py directly.
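Constant.py itself is not shown in the post. Based on the names the other modules import from it, a minimal sketch might look like this; the values are placeholders to adapt to your own setup:

# Constant.py - a sketch; the real values depend on your environment.

JOB_KEY = "python"   # search keyword handed to the crawler
PAGE_NUMBER = 10     # number of result pages to crawl

# MySQL connection settings used by sqlDeal.py (placeholders)
SQL_USER = "root"
password = "your_password"
database = "jobinfo"

# Query used by sqlDeal.sqlselect() to read everything back
SELECT = "select * from zhilian;"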

The analysis starts from the main program.

Main.py

#!/usr/bin/env python
# author:leozhao
# author@email: dhzzy88@163.com

"""
This is the main program of the whole crawler system.
"""
import numpy as np

import dataFactory
import plotpy
import sqlDeal
import zhilian
from Constant import JOB_KEY

# Start the crawler
zhilian.spidefmain(JOB_KEY)

"""
Visualize the data after crawling.
"""
# Read the crawled data back from the database.
# Each row is a tuple: (name, salray, demand, welfare).
value = sqlDeal.sqlselect()

# Salary upper bound, lower bound and average per posting
updata = np.array([], dtype=int)    # np.int is deprecated; use the builtin int
downdata = np.array([], dtype=int)
average = np.array([], dtype=int)
for item in value:
    salray = dataFactory.SarayToInt(item[1])
    salray.slove()
    updata = np.append(updata, salray.up)
    downdata = np.append(downdata, salray.down)
    average = np.append(average, (salray.up + salray.down) / 2)

# Sort the per-posting averages for plotting
average.sort()

# Matching city information is not implemented yet.

# Statistics: both charts are shown for easy viewing.
plotpy.plotl(average)
plotpy.plots(average)

print(average, average.sum())
print("average wage:", average.sum() / len(average))
print("highest:", average.max())
print("lowest:", average.min())
print("number of posts:", len(average))

Main.py basically mirrors the crawler's execution flow: crawl, store, read back, then visualize.

First, the crawler module it imports, zhilian.py:

#!/usr/bin/env python
# author:leo
# author@email : dhzzy88@163.com
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

import sqlDeal
from Constant import PAGE_NUMBER


def init(key="JAVA"):
    # Search for the keyword on the Zhilian Zhaopin home page and return
    # the URL of the result page.
    url = "https://www.zhaopin.com/"
    opt = webdriver.FirefoxOptions()
    opt.set_headless()  # headless browser mode
    driver = webdriver.Firefox(options=opt)
    driver.get(url)
    driver.find_element_by_class_name("zp-search-input").send_keys(key)
    driver.find_element_by_class_name("zp-search-input").send_keys(Keys.ENTER)
    time.sleep(2)
    # The search results open in a new window; switch to it to read the URL.
    handles = driver.window_handles
    driver.switch_to.window(handles[1])
    url = driver.current_url
    driver.quit()  # this browser was only needed to resolve the result URL
    return url


class ZhiLian:

    def __init__(self, key='JAVA'):
        # default keyword: JAVA
        self.url = init(key)
        self.opt = webdriver.FirefoxOptions()
        self.opt.set_headless()
        self.driver = webdriver.Firefox(options=self.opt)
        self.driver.get(self.url)

    def job_info(self):
        # Wait until the job list is present, then extract one page of postings.
        print("Waiting for the page to load")
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "job_title"))
        )
        job_names = self.driver.find_elements_by_class_name("job_title")
        job_sarays = self.driver.find_elements_by_class_name("job_saray")
        job_demands = self.driver.find_elements_by_class_name("job_demand")
        job_welfares = self.driver.find_elements_by_class_name("job_welfare")
        for job_name, job_saray, job_demand, job_welfare in zip(job_names, job_sarays, job_demands, job_welfares):
            sqlDeal.sqldeal(str(job_name.text), str(job_saray.text), str(job_demand.text), str(job_welfare.text))

    def page_next(self):
        # Click the pager button; the compound class "btn btn-pager" needs a
        # CSS selector, since find_element_by_class_name cannot contain spaces.
        try:
            self.driver.find_element_by_css_selector(".btn.btn-pager").click()
        except Exception:
            return None  # no next page
        self.url = self.driver.current_url
        return self.url


def spidefmain(key="JAVA"):
    zhi = ZhiLian(key)
    zhi.job_info()  # first page of results
    # Crawl the configured number of pages.
    page_count = 1
    while page_count < PAGE_NUMBER:
        if zhi.page_next() is None:
            break
        zhi.job_info()
        page_count += 1
    # Close the browser and release the object when finished.
    zhi.driver.quit()
    del zhi


if __name__ == '__main__':
    spidefmain("python")

This module drives Selenium to simulate a browser and load the dynamically rendered pages; the whole crawler revolves around it.

After each page is crawled, the parsed data is stored in the database right away. The database handling is defined in a separate file, so this module only contains the logic for loading pages and extracting information.

The data is stored in a local MySQL database:

#!/usr/bin/env python
# author:leozhao
# author@email :dhzzy88@163.com

import mysql.connector

from Constant import SELECT
from Constant import SQL_USER
from Constant import database
from Constant import password


def sqldeal(job_name, job_salray, job_demand, job_welfare):
    # Store one job posting. Use a parameterized query instead of string
    # formatting to avoid SQL injection and quoting problems.
    conn = mysql.connector.connect(user=SQL_USER, password=password, database=database, use_unicode=True)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO zhilian VALUES (%s, %s, %s, %s)",
        (job_name, job_salray, job_demand, job_welfare),
    )
    conn.commit()
    conn.close()


def sqlselect():
    # Read all stored postings back from the database.
    conn = mysql.connector.connect(user=SQL_USER, password=password, database=database, use_unicode=True)
    print("Connecting to the database to read information")
    cursor = conn.cursor()
    cursor.execute(SELECT)
    values = cursor.fetchall()
    conn.close()
    return values

sqlDeal.py contains two functions: the first stores a posting, and the second reads all postings back.
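The table definition never appears in the post. Given the four text fields that sqldeal inserts, a plausible schema (an assumption, not taken from the original project) could be created with a one-off script like this:

# One-off setup script - the column names and sizes are assumptions,
# inferred from the four string fields that sqldeal() inserts.
import mysql.connector

from Constant import SQL_USER, database, password

conn = mysql.connector.connect(user=SQL_USER, password=password, database=database)
cursor = conn.cursor()
cursor.execute("""
    CREATE TABLE IF NOT EXISTS zhilian (
        name    VARCHAR(128),
        salray  VARCHAR(64),
        demand  VARCHAR(256),
        welfare VARCHAR(256)
    )
""")
conn.commit()
conn.close()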

After the rows are read back, another class processes the data, parsing salary strings such as "10K-20K" into numbers in preparation for visualization.
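dataFactory.py is not reproduced in the post either. From the way Main.py uses it (SarayToInt(text), .slove(), .up, .down), a minimal sketch could look like this; the exact parsing rules are an assumption:

# dataFactory.py - a sketch inferred from Main.py's usage; the real
# project's parsing rules may differ.
import re


class SarayToInt:
    """Parse a salary string such as "10K-20K" into numeric bounds."""

    def __init__(self, text):
        self.text = text
        self.up = 0
        self.down = 0

    def slove(self):
        # Extract the numbers, e.g. "10K-20K" -> down=10, up=20.
        nums = re.findall(r"\d+(?:\.\d+)?", self.text)
        if len(nums) >= 2:
            self.down, self.up = float(nums[0]), float(nums[1])
        elif nums:
            self.down = self.up = float(nums[0])

The plotting functions live in plotpy.py: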

#!/usr/bin/env python
# author:leozhao
# author@email : dhzzy88@163.com

import matplotlib.pyplot as plt
import numpy as np

from Constant import JOB_KEY


# Line graph
def plotl(dta):
    dta.sort()
    print("dta", [dta])
    num = len(dta)
    x = np.linspace(0, num - 1, num)
    print([int(da) for da in dta])
    print(len(dta))
    plt.figure()
    # Plot the average as a flat reference line against x, plus the sorted
    # salaries against their index.
    plt.plot(x, [sum(dta) / num for i in range(num)], dta)

    # plt.xlim(0, 250)
    plt.title(JOB_KEY + ' Job_Info')
    plt.xlabel(JOB_KEY + ' Job_Salray')
    plt.ylabel('JobNumbers')
    plt.show()


# Histogram
def plots(dta):
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.hist(dta, bins=15)
    plt.title(JOB_KEY + ' Job_Info')
    plt.xlabel(JOB_KEY + ' Job_Salray')
    plt.ylabel('JobNumbers')
    plt.show()

Finally, the data is handed to these plotting functions, and the summary statistics (average, maximum, minimum, and the number of postings) are printed.

During crawling, data is written to the database as soon as it is parsed, which keeps the crawler's memory footprint small.

The results are shown below.

Salary survey for finance jobs:

Salary survey for materials science:

Blue is the average wage.

Note that postings requiring a doctorate or a master's degree sit above the average.

More detailed data processing hasn't been done yet; I'll get to it when I have time.
