Crawling the top-rated Douban movies (ranked by score)

First, we need to crawl the movie information from Douban.

Second, we need to save the information that we crawled.

Next, we import the modules we need:

the requests module for sending HTTP requests and the xlwt module for writing Excel files.
We also import the time module: we must not crawl too fast, otherwise the website will recognize the crawler.

import requests
import xlwt
import time

Next, create an Excel workbook, name its worksheet, and write the header row:

# Create the workbook that will hold the crawled data.
excel1 = xlwt.Workbook()
# Add the single worksheet (the original note says the label stands for "Douban").
# NOTE(review): cell_overwrite_ok=True presumably lets a cell be rewritten
# without xlwt raising -- confirm against the xlwt documentation.
sheet1 = excel1.add_sheet('Bean paste', cell_overwrite_ok=True)
# Header row: write(row, column, value) -- row 0, one title per column 0..4.
column_titles = (
    'Movie title',
    'region/Country',
    'particular year',
    'score',
    'Number of people assessed',
)
for column_index, column_title in enumerate(column_titles):
    sheet1.write(0, column_index, column_title)

Now fetch the data.
Analyze the requests made by the Douban movie ranking page and put the useful information we find (query parameters and request headers) into the code:

# Query parameters for one page of results. `pn` is the value of `start` for
# the page being fetched; it is a variable so that other pages can be fetched
# in a loop. Pages are not numbered 1, 2, 3, ...: `start` is an item offset
# that grows by 20 per page (0, 20, 40, ...).
data = {
    'type': '11',
    'interval_id': '100:90',
    'action': '',
    'start': pn,
    'limit': '20',
}

# Browser-like request headers copied from a real session.
# NOTE(review): the Cookie value appears to contain session identifiers
# (e.g. dbcl2, ck). Hard-coding it in published code leaks that session and it
# will eventually expire -- consider loading it from configuration instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=',
    'Cookie':'bid=6nS16u78ysk; douban-fav-remind=1; __utmz=30149280.1536302200.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118371"; _vwo_uuid_v2=D1979D38657B6548A1BB551C1064F3530|bf98aa297a7ebd3c5226eab14e60b262; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1052346626.1533889879.1536302200.1538099719.3; __utmc=30149280; __utma=223695111.1669584758.1538099719.1538099719.1538099719.1; __utmb=223695111.0.10.1538099719; __utmc=223695111; __utmz=223695111.1538099719.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ps=y; dbcl2="185126615:6R0mlwkYAeI"; ck=ga0K; push_noty_num=0; push_doumail_num=0; __yadk_uid=KHFgBWW1h7s9vtx9sYMoQeaaZ8e8CNU1; __utmt=1; __utmv=30149280.18512; __utmb=30149280.8.10.1538099719; _pk_id.100001.4cf6=5aa5204a625172c4.1538099719.1.1538100311.1538099719.'
}
# JSON endpoint behind the ranking page; the query string is built from `data`.
Requesturl = 'https://movie.douban.com/j/chart/top_list'

# A GET request carries its parameters in the query string (params=), not in
# a request body (data=). The timeout keeps a stalled connection from hanging
# the crawl forever, and raise_for_status() surfaces HTTP errors before the
# body is parsed as JSON.
res = requests.get(Requesturl, params=data, headers=headers, timeout=10)
res.raise_for_status()
result = res.json()

Now we have one page full of movie information.
We need to extract the fields we care about and write them into the Excel sheet.
Each page holds up to 20 movies, so we loop over them and pull out the information for each movie.

 # Walk over the movies actually present in `result` (the decoded JSON list
 # for the current page) rather than assuming there are exactly 20 of them.
 for movie in result:
     try:
         job_name = movie['title']
         regions = movie['regions']
         # Some records may carry no region; keep the movie, leave the cell empty.
         job_name1 = regions[0] if regions else ''
         job_name2 = movie['release_date']
         job_name3 = movie['score']
         job_name4 = movie['vote_count']
     except (KeyError, TypeError) as err:
         # Skip a malformed record, but report it instead of hiding the error.
         print('Skipping malformed record: %r' % (err,))
         continue

     print(job_name)

     # `n` is the next free worksheet row (row 0 holds the column titles).
     sheet1.write(n, 0, job_name)
     sheet1.write(n, 1, job_name1)
     sheet1.write(n, 2, job_name2)
     sheet1.write(n, 3, job_name3)
     sheet1.write(n, 4, job_name4)
     n += 1

 # Save once per page: saving after every row rewrote the same file for each
 # movie, and no request is made in this loop, so no per-row delay is needed.
 excel1.save('Douban high score movie.xls')

Finally, we need to loop over every page, but we don't know how many pages there are.
I've allowed for up to 1000 pages here (start offsets 0 to 19980 in steps of 20). I don't think there are that many.

for pn in range(0, 20000, 20):
    # Put the code from the previous steps here, indented inside this loop.
    # Pause for one second on every pass so the pages are not requested too fast.
    time.sleep(1)
    # Report which page has just been saved. `pn` grows by 20 per page, so
    # integer division gives a whole page index (0, 1, 2, ...) instead of 0.0, 1.0, ...
    print('--------------This is the first.%s page------------------' % (pn // 20))
	
	

The complete code is as follows:

import requests
import xlwt
import time


# Workbook with a single worksheet that receives one row per movie.
excel1 = xlwt.Workbook()
sheet1 = excel1.add_sheet('Bean paste', cell_overwrite_ok=True)
# Row 0 is the header row: one title per column, columns 0 through 4.
column_titles = (
    'Movie title',
    'region/Country',
    'particular year',
    'score',
    'Number of people assessed',
)
for column_index, column_title in enumerate(column_titles):
    sheet1.write(0, column_index, column_title)

# Next worksheet row to fill; row 0 already holds the column titles.
n = 1

# The headers are identical for every page, so build them once outside the loop.
# NOTE(review): the Cookie value appears to contain session identifiers
# (e.g. dbcl2, ck). Hard-coding it in published code leaks that session and it
# will eventually expire -- consider loading it from configuration instead.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5702.400 QQBrowser/10.2.1893.400',
    'Referer': 'https://movie.douban.com/typerank?type_name=%E5%89%A7%E6%83%85&type=11&interval_id=100:90&action=',
    'Cookie':'bid=6nS16u78ysk; douban-fav-remind=1; __utmz=30149280.1536302200.2.2.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ll="118371"; _vwo_uuid_v2=D1979D38657B6548A1BB551C1064F3530|bf98aa297a7ebd3c5226eab14e60b262; _pk_ses.100001.4cf6=*; ap_v=0,6.0; __utma=30149280.1052346626.1533889879.1536302200.1538099719.3; __utmc=30149280; __utma=223695111.1669584758.1538099719.1538099719.1538099719.1; __utmb=223695111.0.10.1538099719; __utmc=223695111; __utmz=223695111.1538099719.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); ps=y; dbcl2="185126615:6R0mlwkYAeI"; ck=ga0K; push_noty_num=0; push_doumail_num=0; __yadk_uid=KHFgBWW1h7s9vtx9sYMoQeaaZ8e8CNU1; __utmt=1; __utmv=30149280.18512; __utmb=30149280.8.10.1538099719; _pk_id.100001.4cf6=5aa5204a625172c4.1538099719.1.1538100311.1538099719.'
}
# JSON endpoint behind the ranking page; the query string is built from `data`.
Requesturl = 'https://movie.douban.com/j/chart/top_list'

# `pn` is the item offset of each page: 0, 20, 40, ... (20 movies per page).
for pn in range(0, 20000, 20):
    data = {
        'type': '11',
        'interval_id': '100:90',
        'action': '',
        'start': pn,
        'limit': '20',
    }

    # GET parameters belong in the query string (params=), not in a request
    # body (data=). The previous call also passed an undefined `proxies` name,
    # which raised NameError before the first request was ever sent.
    res = requests.get(Requesturl, params=data, headers=headers, timeout=10)
    res.raise_for_status()
    result = res.json()

    # Stop as soon as a page comes back empty (presumably the offset is past
    # the last movie) or is not a list of records at all, instead of sleeping
    # through the remaining offsets.
    if not isinstance(result, list) or not result:
        break

    # Iterate over the records actually returned; the last page may hold
    # fewer than 20.
    for movie in result:
        try:
            job_name = movie['title']
            regions = movie['regions']
            # Some records may carry no region; keep the movie, leave the cell empty.
            job_name1 = regions[0] if regions else ''
            job_name2 = movie['release_date']
            job_name3 = movie['score']
            job_name4 = movie['vote_count']
        except (KeyError, TypeError) as err:
            # Skip a malformed record, but report it instead of hiding the error.
            print('Skipping malformed record: %r' % (err,))
            continue

        print(job_name)

        sheet1.write(n, 0, job_name)
        sheet1.write(n, 1, job_name1)
        sheet1.write(n, 2, job_name2)
        sheet1.write(n, 3, job_name3)
        sheet1.write(n, 4, job_name4)
        n += 1

    # Save once per page so progress survives a later failure; saving after
    # every row rewrote the same file for each movie.
    excel1.save('Douban high score movie.xls')

    # Throttle between page requests so the crawl is not too fast.
    time.sleep(1)

    # Integer division keeps the page index whole (0, 1, 2, ...) on Python 3.
    print('--------------This is the first.%s page------------------' % (pn // 20))




Keywords: Excel Windows JSON

Added by rashpal on Mon, 23 Dec 2019 23:17:00 +0200