import re

import requests
from bs4 import BeautifulSoup

# Request headers captured from Chrome DevTools; the session cookie is
# required because the intranet address book needs an authenticated session.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
    # NOTE(review): the cookie value was truncated in the original source —
    # paste the full 'Cookie' header copied from the browser here.
    'Cookie': 'tips=1...',
}

BASE = 'http://www.sh.ctc.com/CompanyAddressListNew/newDeptShow.do'
DEPT_ID = '0331500020'
OUTPUT_FILE = 'Pudong address book.txt'


def single_agent(agent_id, agent_py):
    """Fetch one employee's detail page and append its fields to OUTPUT_FILE.

    agent_id -- employee id extracted from the department list page
    agent_py -- employee name in Pinyin (the 'zygzh' query parameter)
    """
    url = (BASE + '?method=doViewLayer&id=' + agent_id
           + '&isVirtal=no&zygzh=' + agent_py
           + '&ZDWID=' + DEPT_ID + '&virtualType=')
    r = requests.get(url=url, headers=HEADERS)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    staff_detail = [td.text.strip() for td in soup.select('td')]
    # Drop layout/label cells that carry no employee data; the indices were
    # found by inspecting the detail page's table structure.
    del staff_detail[2]
    del staff_detail[7]
    del staff_detail[10:14]
    with open(OUTPUT_FILE, 'a', encoding='utf-8') as file:
        file.write(','.join(staff_detail))
        file.write('\n')


def main():
    """Walk every page of the department address book and export each employee."""
    # First request (page 1) is only used to read the paging metadata.
    # BUG FIX: the original URL contained the mojibake '¤tPage=' — the
    # '&curren' of '&currentPage' had been swallowed by the HTML entity
    # '&curren;' — so the page parameter was never actually sent.
    list_url = (BASE + '?method=doSearch&ZDWID=' + DEPT_ID
                + '&currentPage=1&orderIndex=&orderSign=1&str=all&isVirtal=no')
    r = requests.get(url=list_url, headers=HEADERS)
    r.encoding = 'utf-8'
    soup = BeautifulSoup(r.text, 'lxml')
    total_page = int(soup.find(name='input', attrs={'name': 'totalPage'})['value'])

    # The department head-count sits in a <b class="f-fam1"> element; the
    # regex keeps only the run of digits between non-digit characters.
    # BUG FIX: attrs must be a dict keyed by attribute name (the original
    # passed the bare set {'f-fam1'}); the regex is now a raw string so the
    # '\D'/'\d' escapes are not interpreted as string escapes.
    temp_renshu = soup.find(name='b', attrs={'class': 'f-fam1'}).string
    renshu = int(re.search(r'\D\D(\d+)\D', temp_renshu).group(1))
    print('the department address book has {} pages, {} people'
          .format(total_page, renshu))

    # Visit every list page and pull each employee's id + Pinyin keyword.
    # BUG FIX: pages are 1-based (the metadata request above used
    # currentPage=1); the original range(totalPage + 1) started at page 0,
    # fetching one spurious/duplicate page.
    for page in range(1, total_page + 1):
        page_url = (BASE + '?method=doSearch&ZDWID=' + DEPT_ID
                    + '&currentPage=' + str(page)
                    + '&orderIndex=&orderSign=1&str=all&isVirtal=no')
        r = requests.get(url=page_url, headers=HEADERS)
        r.encoding = 'utf-8'
        soup = BeautifulSoup(r.text, 'lxml')
        # Each employee row is a clickable <div>; its onclick attribute holds
        # the id and Pinyin as the 2nd and 4th single-quoted fragments.
        click_views = soup.find_all(name='div', attrs={'style': 'cursor: pointer;'})
        for view in click_views:
            parts = str(view).split("'")
            single_agent(parts[1], parts[3])


if __name__ == '__main__':
    main()
Result: further optimized yesterday's code. At the same time, employee-information gathering was expanded from the original main menu to the sub-menu (specifically, a new window is opened to obtain more detailed data), and the employee id and other keywords are now extracted from the main menu.
It also reinforced the basics: deleting from, appending to, and exporting a list; searching for fields; extracting data from the soup; etc.
Tomorrow's plan: continue to crawl the company's internal address book, fetch each employee's picture, and export them.