Learning crawlers, day 2: continuing to crawl the company's internal address book

import requests
from bs4 import BeautifulSoup
import re

# First, copy the request headers from Chrome DevTools, including the User-Agent and Cookie
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    'Cookie': 'tips=1...',  # truncated here; paste the full cookie string from the browser
}
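Before crawling anything, it is worth a quick check that the copied Cookie is still valid. A minimal sketch, reusing the department search URL that appears further down; the idea that an expired session returns something other than a plain 200 page is an assumption about this site:

# Sanity check: request one page and confirm the session headers still work
test_url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=0331500020&currentPage=1&orderIndex=&orderSign=1&str=all&isVirtal=no"
test = requests.get(test_url, headers=headers)
print(test.status_code)  # anything other than 200 suggests the Cookie has expired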

# Fetch the detail page for a single employee and append one line to the output file
def single_agent(agent_id, agent_py):
    url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doViewLayer&id="+agent_id+"&isVirtal=no&zygzh="+agent_py+"&ZDWID=0331500020&virtualType="
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    staff_detail = []
    for td in soup.select('td'):
        staff_detail.append(td.text.strip())
    # Remove the cells this page layout doesn't need.
    # Note: del works on the list as it stands, so each deletion shifts the
    # indices of everything after it; these positions are therefore relative
    # to the already-shortened list, not to the original td order.
    del staff_detail[2]
    del staff_detail[7]
    del staff_detail[10:14]
    # Export the collected employee information to the file, one comma-separated line per person
    with open('Pudong address book.txt', 'a', encoding='utf-8') as file:
        file.write(','.join(staff_detail))
        file.write('\n')
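One subtlety in single_agent is worth spelling out: del operates on the list as it currently stands, so each deletion shifts the indices of the remaining elements. A tiny standalone demo with made-up values:

# Demo of how del shifts indices (sample list is made up)
demo = ['a', 'b', 'c', 'd', 'e']
del demo[1]    # removes 'b'; 'c', 'd', 'e' all shift left by one
del demo[1]    # now removes 'c', not 'd'
print(demo)    # ['a', 'd', 'e']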

# Get the maximum number of pages in this department's address book
url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=0331500020&currentPage=1&orderIndex=&orderSign=1&str=all&isVirtal=no"
r = requests.get(url=url, headers=headers)
r.encoding = 'utf-8'
soup = BeautifulSoup(r.text, 'lxml')
totalPage = int(soup.find(name='input', attrs={'name': 'totalPage'})['value'])
# Get the total number of people in this department with a regular expression
temp_renshu = soup.find('b', class_='f-fam1').string
renshu = int(re.search(r'\D\D(\d+)\D', temp_renshu).group(1))
# print("the department address book has {} pages, {} people".format(totalPage, renshu))
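To see what that regular expression is doing, here is a quick demo on a made-up string with the same shape as the page's headcount text (two non-digits, the number, then a trailing non-digit; the sample text itself is an assumption):

# Quick demo of the headcount regex on a hypothetical sample string
sample = '共有135人'  # placeholder; the real <b class="f-fam1"> text may differ
print(re.search(r'\D\D(\d+)\D', sample).group(1))  # prints '135'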

# Get the id and name Pinyin of each employee on each page
for j in range(1, totalPage + 1):  # page numbers start at 1, so skip 0
    url = r"http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do?method=doSearch&ZDWID=0331500020&currentPage="+str(j)+"&orderIndex=&orderSign=1&str=all&isVirtal=no"
    r = requests.get(url=url, headers=headers)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    agent_clickView = soup.find_all(name='div', attrs={'style': 'cursor: pointer;'})
    for div in agent_clickView:
        # The id and name Pinyin sit inside each div's markup as the first two
        # single-quoted values, so splitting on quotes extracts them
        clickView = str(div).split("'")
        agent_id = clickView[1]
        agent_py = clickView[3]
        single_agent(agent_id, agent_py)
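Splitting the whole div on single quotes is fragile if the markup ever gains another quoted value before the onclick. Pulling out all quoted arguments with a regex is a slightly safer variant; the sample div below is entirely made up (including the doViewLayer call), on the same assumption the split relies on, namely that the id and Pinyin are the first two single-quoted values:

# Safer variant of the quote-splitting above, shown on a hypothetical div
sample_div = '<div style="cursor: pointer;" onclick="doViewLayer(\'00123\',\'zhangsan\')"></div>'
quoted = re.findall(r"'([^']*)'", sample_div)  # every single-quoted value, in order
print(quoted[0], quoted[1])  # 00123 zhangsan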

Harvest: further optimized yesterday's code, and extended the employee information gathering from the original main menu down to the sub-menu (specifically, the detail window opened for each employee, which carries more data), with the employee id and other keywords scraped from the main menu.
It also reinforced the basics: deleting from, appending to, and exporting lists; searching for fields and pulling data out of the soup; and so on.
Tomorrow's plan: continue crawling the company's internal address book, and fetch and export each employee's photo.
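The photo task should only need a binary GET with the same headers. A minimal sketch, reusing an agent_id from the crawl above; the photo URL pattern is a pure placeholder, not the site's real endpoint, which will have to be found in DevTools:

# Hypothetical sketch: download one employee's photo and save the raw bytes
photo_url = 'http://www.sh.ctc.com/path/to/photo?id=' + agent_id  # placeholder URL
photo = requests.get(photo_url, headers=headers)
with open(agent_id + '.jpg', 'wb') as f:  # 'wb': images must be written as bytes
    f.write(photo.content)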
