Crawling Douban movie information

Yesterday I wrote a small crawler that collects information about the 2017 mainland-China films listed on Douban. The target site is Douban Movies; for each film the crawler scrapes the title, director, screenwriters, leading actors, genres, release dates, running time, score and link, and saves them in MongoDB.

At first I used the machine's own IP address, without any proxy IP. After a dozen or so pages had been requested, no more data came back and the requests failed with HTTP 302. I then tried opening the page in a browser and found that the browser was getting a 302 as well...


But I am not worried, because I have proxy IPs, ha ha ha! See my previous post for details: Crawling proxy IPs.
With proxy IPs the crawler can keep receiving data, but 302 responses still show up from time to time. That's fine: just send the request again through another proxy IP. If that still fails, send it again; and if that fails too, send it again, and again...


Some of the code is attached below.

1. Crawler file
import scrapy
import json
from douban.items import DoubanItem


# Douban search API for films tagged mainland China / 2017. The "{}"
# placeholder is the value of the "start" query parameter (the listing offset).
parse_url = "https://movie.douban.com/j/new_search_subjects?sort=U&range=0,10&tags=%E7%94%B5%E5%BD%B1&start={}&countries=%E4%B8%AD%E5%9B%BD%E5%A4%A7%E9%99%86&year_range=2017,2017"


class Cn2017Spider(scrapy.Spider):
    """Crawl the Douban listing of 2017 mainland-China films and their detail pages.

    ``parse`` handles the JSON listing pages: it schedules one detail-page
    request per film and, from the initial page only, the remaining listing
    pages. ``get_detail_content`` fills a ``DoubanItem`` from a detail page.
    """

    name = 'cn2017'
    allowed_domains = ['douban.com']
    # Same URL as before, now derived from the template so the two cannot drift.
    start_urls = [parse_url.format(0)]

    # Listing offsets requested after the first page:
    # page_step, 2 * page_step, ... (strictly below max_offset).
    page_step = 20
    max_offset = 3200
    # Upper bound on re-requests of a detail page whose title could not be
    # extracted; keeps a permanently broken page from looping forever.
    max_detail_retries = 20

    def parse(self, response):
        """Handle one JSON listing page.

        Yields a detail-page request for every film in the payload's "data"
        list. When the response belongs to the initial request (its meta has
        no "is_followup_page" flag), also yields the requests for all other
        listing offsets.
        """
        try:
            payload = json.loads(response.body.decode())
        except ValueError:
            # Covers both undecodable bytes and bodies that are not JSON.
            self.logger.warning("Listing page is not valid JSON: %s", response.url)
            payload = None

        # A payload without a "data" list is treated as an empty page
        # instead of raising KeyError/TypeError.
        films = payload.get("data") if isinstance(payload, dict) else None
        for film in films or []:
            self.logger.debug("Found film page %s", film["url"])
            item = DoubanItem()
            item["url"] = film["url"]
            yield scrapy.Request(
                film["url"],
                callback=self.get_detail_content,
                meta={"item": item},
            )

        # Schedule the other listing pages once, from the first page, rather
        # than re-creating the whole set on every page and leaving the
        # duplicate filter to discard them.
        if not response.meta.get("is_followup_page"):
            for offset in range(self.page_step, self.max_offset, self.page_step):
                yield scrapy.Request(
                    parse_url.format(offset),
                    callback=self.parse,
                    meta={"is_followup_page": True},
                )

    def get_detail_content(self, response):
        """Fill the item carried in ``response.meta["item"]`` from a detail page.

        Yields the completed item when a film title was found. Otherwise the
        same URL is requested again (bypassing the duplicate filter), at most
        ``max_detail_retries`` times; after that the film is logged and dropped.
        """
        item = response.meta["item"]
        info = "//div[@id='info']"
        item["film_name"] = response.xpath("//div[@id='content']//span[@property='v:itemreviewed']/text()").extract_first()
        # The positional span[1]/span[2]/span[3] lookups assume the info box
        # lists director, screenwriters and cast in that order.
        item["director"] = response.xpath(info + "/span[1]/span[2]/a/text()").extract_first()
        # The original expression began with a stray third slash ("///div");
        # it is normalised to "//div" here.
        item["scriptwriter"] = response.xpath(info + "/span[2]/span[2]/a/text()").extract()
        # position()<6 keeps only the first five names.
        item["starring"] = response.xpath(info + "/span[3]/span[2]/a[position()<6]/text()").extract()
        item["type"] = response.xpath(info + "/span[@property='v:genre']/text()").extract()
        item["release_date"] = response.xpath(info + "/span[@property='v:initialReleaseDate']/text()").extract()
        item["running_time"] = response.xpath(info + "/span[@property='v:runtime']/@content").extract_first()
        item["score"] = response.xpath("//div[@class='rating_self clearfix']/strong/text()").extract_first()

        if item["film_name"] is not None:
            yield item
            return

        # No title: presumably a block/redirect page was served instead of the
        # film page, so ask again.
        attempts = response.meta.get("detail_retries", 0)
        if attempts >= self.max_detail_retries:
            self.logger.warning(
                "Giving up on %s after %d retries without a title",
                item["url"],
                attempts,
            )
            return
        yield scrapy.Request(
            item["url"],
            callback=self.get_detail_content,
            meta={"item": item, "detail_retries": attempts + 1},
            dont_filter=True,
        )
2.items.py file
import scrapy


class DoubanItem(scrapy.Item):
    """Container for the data scraped about one film."""

    film_name = scrapy.Field()     # film title
    director = scrapy.Field()      # director
    scriptwriter = scrapy.Field()  # screenwriter(s)
    starring = scrapy.Field()      # leading actors
    type = scrapy.Field()          # genre(s)
    release_date = scrapy.Field()  # release date(s)
    running_time = scrapy.Field()  # running time
    score = scrapy.Field()         # rating score
    url = scrapy.Field()           # link to the film's page
3. middlewares.py file
from douban.settings import USER_AGENT_LIST
import random
import pandas as pd


class UserAgentMiddleware(object):
    """Downloader middleware that gives each request a random User-Agent."""

    def process_request(self, request, spider):
        """Set the request's User-Agent header to a random USER_AGENT_LIST entry.

        Returns None.
        """
        request.headers["User-Agent"] = random.choice(USER_AGENT_LIST)


class ProxyMiddleware(object):
    """Downloader middleware that sends each request through a random proxy.

    Proxies are read from the "ip" column of the CSV file at ``ip_csv_path``.
    The parsed list is cached and reloaded only when the file's modification
    time changes, so the file is no longer re-read and re-parsed for every
    request while edits to it are still picked up.
    """

    # CSV file with an "ip" column; each value is appended to "http://"
    # (presumably "host:port" -- confirm against the file).
    ip_csv_path = r"C:\Users\Administrator\Desktop\douban\douban\ip.csv"

    # Cache state kept as class-level defaults so no __init__ is required.
    _proxies = None
    _proxies_mtime = None

    def _load_proxies(self):
        """Return the proxy list, re-reading the CSV only if it has changed."""
        import os  # local import: the module's import block is left untouched

        mtime = os.path.getmtime(self.ip_csv_path)
        if self._proxies is None or mtime != self._proxies_mtime:
            self._proxies = pd.read_csv(self.ip_csv_path)["ip"].tolist()
            self._proxies_mtime = mtime
        return self._proxies

    def process_request(self, request, spider):
        """Set ``request.meta["proxy"]`` to a randomly chosen proxy URL.

        Returns None. Raises IndexError (from ``random.choice``) when the
        CSV contains no proxies.
        """
        ip = random.choice(self._load_proxies())
        request.meta["proxy"] = "http://" + ip
        return None
4.pipelines.py file
from pymongo import MongoClient

# Module-level MongoDB handles: a client created with no arguments (default
# connection settings) and the "douban" collection of the "test" database.
client = MongoClient()
collection = client["test"]["douban"]


class DoubanPipeline(object):
    """Item pipeline that stores every scraped item in MongoDB."""

    def process_item(self, item, spider):
        """Insert the item as one document and hand it on.

        ``insert_one`` replaces the deprecated ``Collection.insert``, which
        is no longer available in PyMongo 4. The item is returned so that
        any later pipeline stage still receives it.
        """
        collection.insert_one(dict(item))
        return item
5.settings.py file
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

# Give up on a download after 10 seconds.
DOWNLOAD_TIMEOUT = 10

# Retry failed downloads, up to 10 times each.
RETRY_ENABLED = True
RETRY_TIMES = 10

# Project downloader middlewares and their order values: the random
# User-Agent middleware (543) and the random-proxy middleware (544).
DOWNLOADER_MIDDLEWARES = {
    'douban.middlewares.UserAgentMiddleware': 543,
    'douban.middlewares.ProxyMiddleware': 544,
}

# Scraped items are handed to the MongoDB pipeline.
ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,
}

The program ran for 1 hour, 20 minutes and 21.473772 seconds and captured 2986 records.

Finally,
stay happy every day!

Keywords: Python JSON MongoDB

Added by LHBraun on Sun, 01 Dec 2019 16:55:20 +0200