Douban crawler (get the Douban ID from a play name)

Suppose you only know a list of play names, e.g. ['northwest wolf of war', 'martial arts biography'], and want to crawl their Douban information. The first step is to find each play's Douban ID, because the ID is what lets you build the link to that play's information page.

 

1. Load the packages

# coding: utf-8
import urllib.request
import pandas as pd
import urllib.parse
import requests
import re
import matplotlib.pyplot as plt
from imageio import imread
from wordcloud import WordCloud
import jieba, codecs
from lxml import html
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import random
import time
import os

2. Define a function: find the Douban ID from a play name

name: the name of the play, such as "martial arts biography"

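For example, the suggestion URL for the play 武林外传 (the percent-encoded part after "q=" is the UTF-8 encoded play name) is https://movie.douban.com/j/subject_suggest?q=%E6%AD%A6%E6%9E%97%E5%A4%96%E4%BC%A0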

def find_ID(name, fetch=None):   # name is the play name to look up
    """Return the Douban subject ID(s) whose title is exactly ``name``.

    The play name is percent-encoded and sent to Douban's
    ``subject_suggest`` endpoint, which answers with a JSON array of
    suggestion entries.  Every entry whose ``title`` equals ``name`` and
    that carries an ``id`` contributes one ID to the result.

    Args:
        name: Play title to search for (matched exactly, not as a pattern).
        fetch: Optional callable ``fetch(url) -> str`` returning the response
            body as text.  When omitted, the page is downloaded with
            ``requests.get``.  Supplying it lets callers add their own
            headers, session or proxy handling.

    Returns:
        A ``(name_list, id_list)`` tuple of equally long lists:
        ``name_list`` repeats ``name`` once per matching entry and
        ``id_list`` holds the corresponding IDs as strings.  Several IDs can
        be returned when different plays share the same title; both lists
        are empty when nothing matches.

    Raises:
        Exception: Any failure while building the URL, downloading or
            decoding the response is reported with ``print('ERROR:', name)``
            and then re-raised, so the caller can record the failed name.
    """
    import json  # local import: the block stays self-contained

    try:
        # URLs only allow a subset of ASCII, so the (possibly Chinese) name
        # has to be percent-encoded before it is appended to the query.
        url = 'https://movie.douban.com/j/subject_suggest?q=' + urllib.parse.quote(name)
        if fetch is None:
            # A timeout keeps one stalled request from hanging the whole crawl.
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            body = response.content.decode('utf-8')
        else:
            body = fetch(url)
        entries = json.loads(body)
        if not isinstance(entries, list):
            raise ValueError('unexpected response from subject_suggest')
    except Exception:
        # Report which play failed, then let the real error reach the caller
        # instead of hiding it behind an unrelated UnboundLocalError.
        print('ERROR:', name)
        raise

    # Compare titles as plain strings: no regex is involved, so names that
    # contain characters such as "(" or "+" are matched literally.
    id_list = [
        str(entry['id'])
        for entry in entries
        if isinstance(entry, dict) and entry.get('title') == name and entry.get('id')
    ]

    # One copy of the name per ID so that the two lists pair up by index.
    name_list = [name] * len(id_list)
    return name_list, id_list

3. Define the loop: look up every name in a de-duplicated list of play names

def generate_ID(nondup_name_list):  # nondup_name_list: play names without duplicates
    """Look up the Douban ID(s) for every play name in ``nondup_name_list``.

    Each name is passed to ``find_ID``; the names and IDs it returns are
    collected, and names whose lookup raised are gathered separately.  The
    collected IDs and names are printed, followed by either the list of
    failed names or a success banner.

    Args:
        nondup_name_list: Iterable of play names to look up.

    Returns:
        A ``(id_list, name_list)`` tuple -- IDs first, names second, i.e.
        the opposite order of what ``find_ID`` returns.
    """
    name_list = []
    id_list = []
    error_list = []
    for name in nondup_name_list:
        try:
            item_name, item_id = find_ID(name)
            name_list.extend(item_name)
            id_list.extend(item_id)
        except Exception:
            # append, not extend: extending with a string would store the
            # failed name one character at a time.  Catching Exception
            # (rather than everything) keeps Ctrl-C able to stop the crawl.
            error_list.append(name)
        # Random pause of 1.05-6 s between requests to avoid being blocked.
        time.sleep(1 + float(random.randint(1, 100)) / 20)

    # Drop every empty placeholder, not just the first one found.
    id_list = [item for item in id_list if item != '']
    error_list = [item for item in error_list if item != '']

    print(id_list)
    print(name_list)

    if error_list:
        print('Not generated successfully ID The title of the play:',error_list)
    else:
        print('------------------------------All of them are successfully produced into Douban ID------------------------------------')
    return id_list,name_list

4. Run it: pass in a list of play names that contains no duplicates

# Play names to resolve; each entry should appear only once.
# NOTE(review): find_ID compares titles for an exact match, and these titles
# look machine-translated -- confirm they are the titles Douban actually lists.
nondup_name_list = [
    'War northwest Wolf',
    'The spread of Wulin',
]

# generate_ID hands back the IDs first and the matching names second.
id_list, name_list = generate_ID(nondup_name_list)

Keywords: Big Data ascii

Added by semsem on Fri, 13 Dec 2019 20:24:48 +0200