Suppose you only have a list of play names, e.g. ['northwest wolf of war', 'martial arts biography'], and want to crawl their Douban information. The first step is to find the Douban ID of each play, so that you can then connect to that play's information page.
1. Load the packages
# coding: utf-8 import urllib.request import pandas as pd import urllib.parse import requests import re import matplotlib.pyplot as plt from imageio import imread from wordcloud import WordCloud import jieba, codecs from lxml import html from bs4 import BeautifulSoup from bs4 import UnicodeDammit import random import time import os
2. Define a function: find the Douban ID by play name
`name`: the name of the play, such as "martial arts biography"
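For example, the query URL for "martial arts biography" (武林外传) is https://movie.douban.com/j/subject_suggest?q=%E6%AD%A6%E6%9E%97%E5%A4%96%E4%BC%A0 — the part after `q=` is the percent-encoded play name.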
def find_ID(name):
    """Find the Douban ID(s) of the play whose title is exactly *name*.

    Queries Douban's ``subject_suggest`` endpoint and keeps the entries whose
    ``title`` equals *name*. Several IDs may be returned, because different
    plays can share the same title.

    Args:
        name: The play name to look up (e.g. a Chinese title).

    Returns:
        A ``(name_list, id_list)`` pair of equal length: ``id_list`` holds the
        matching IDs as strings and ``name_list`` repeats *name* once per ID.
        Both lists are empty when no entry has that exact title.

    Raises:
        Exception: Any failure while building the URL, fetching the page or
            decoding the response is reported with ``print('ERROR:', name)``
            and then re-raised, so the caller can record the failed name.
    """
    import json  # local import so the function does not rely on a file-level one

    try:
        # URLs only allow a subset of ASCII; other characters (such as Chinese
        # ones) must be percent-encoded before being placed in the query.
        url = ('https://movie.douban.com/j/subject_suggest?q='
               + urllib.parse.quote(name))
        # A timeout keeps one stalled request from hanging the whole crawl.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        entries = json.loads(response.content.decode())
        if not isinstance(entries, list):
            raise ValueError('unexpected response format')
    except Exception:
        # Report which play failed, then let the caller decide what to do.
        print('ERROR:', name)
        raise

    # Compare titles by equality rather than by regex, so names containing
    # regex metacharacters ('.', '(', '+', ...) are matched literally.
    id_list = [
        str(entry['id'])
        for entry in entries
        if isinstance(entry, dict)
        and entry.get('title') == name
        and entry.get('id')
    ]
    # One copy of the name per ID keeps the two lists aligned.
    name_list = [name] * len(id_list)
    return name_list, id_list
3. Define the loop: process a de-duplicated list of play names
def generate_ID(nondup_name_list):
    """Look up the Douban ID of every play name in a de-duplicated list.

    Calls ``find_ID`` once per name, pausing a random 1.05-6 seconds after
    each lookup. Prints the collected IDs and names, followed by either the
    names that failed or a success banner.

    Args:
        nondup_name_list: Iterable of play names, without duplicates.

    Returns:
        ``(id_list, name_list)``: the IDs found and, in the same order, the
        play name each ID belongs to.
    """
    name_list = []
    id_list = []
    error_list = []
    for name in nondup_name_list:
        try:
            item_name, item_id = find_ID(name)
        except Exception:
            # append, not extend: extend() would split the failed name into
            # single characters.
            error_list.append(name)
        else:
            name_list.extend(item_name)
            id_list.extend(item_id)
        # Random delay between requests to reduce the risk of being blocked.
        time.sleep(1 + float(random.randint(1, 100)) / 20)

    # Drop every empty placeholder (a lookup that matched nothing can yield
    # ''), not just the first one.
    id_list = [douban_id for douban_id in id_list if douban_id != '']
    error_list = [failed for failed in error_list if failed != '']

    print(id_list)
    print(name_list)
    if error_list:
        print('Not generated successfully ID The title of the play:', error_list)
    else:
        print('------------------------------All of them are successfully produced into Douban ID------------------------------------')
    return id_list, name_list
4. Run it: supply a list of play names without duplicates
# Play names to look up; the list is expected to contain no duplicates.
nondup_name_list = [
    'War northwest Wolf',
    'The spread of Wulin',
]

# Resolve every name to its Douban ID(s); the two results are parallel lists.
id_list, name_list = generate_ID(nondup_name_list)