Douban Movie Top 250 and Book Top 250 Crawlers
Recently I started playing with Python. While learning about web crawlers, on a whim I scraped Douban's Top 250 movies and Top 250 books. Here I record the process.
Movie Top 250 Crawler
import requests
from bs4 import BeautifulSoup
import time


def getlist(list_url):
    """Crawl the Douban movie Top 250 listing starting at *list_url*.

    Walks the "next page" links iteratively (the original recursed once per
    page, growing the call stack for no benefit) and appends one formatted
    text record per movie to the module-level ``movie`` list.

    Returns the module-level ``movie`` list with all collected records.
    """
    next_url = list_url
    while next_url:
        time.sleep(2)  # be polite: throttle so we don't hammer the site
        res = requests.get(next_url, timeout=10)  # timeout so a stalled server can't hang us forever
        soup = BeautifulSoup(res.text, 'html.parser')
        for m in soup.select('.grid_view li'):
            rank = m.select('em')[0].text
            score = m.select('.rating_num')[0].text
            title = m.select('.title')[0].text
            direct = m.select('.info .bd p')[0].text.strip()
            # Re-join the director/cast line so each field lands on its own
            # line in the output record.
            actor = '\n To star:'.join(direct.split(' To star:'))
            director = 'years:'.join(actor.split(' '))
            # Not every entry carries a one-line quote (.inq).
            if m.select('.inq'):
                comments = m.select('.inq')[0].text.strip()
            else:
                comments = 'None'
            movie.append(
                'ranking: ' + rank + '\n'
                + 'score: ' + score + '\n'
                + 'title: ' + title + '\n'
                + director + '\n'
                + 'comment: ' + comments + '\n' + '\n')
        # The "next" link is relative; prefix the seed URL to follow it.
        next_link = soup.select('.next a')
        if next_link:
            next_url = seed_url + next_link[0]['href']
        else:
            print('End')
            next_url = None
    return movie


def write(movies):
    """Dump the collected movie records to movie.txt (UTF-8)."""
    with open('movie.txt', 'w', encoding='utf8') as m:
        m.writelines(movies)


def main():
    write(getlist(seed_url))


if __name__ == '__main__':
    seed_url = 'https://movie.douban.com/top250'
    movie = []
    main()
Book Top 250 Crawler
import bs4
import requests
import re
from bs4 import BeautifulSoup
from operator import itemgetter

# Digits-only pattern used to pull the rating count out of e.g. "(123456 ...)".
# Compiled once at module level instead of inside the per-book loop.
_NUM_RE = re.compile(r'[\d]{1,10}')


def getHtmlText(url):
    """Fetch *url* and return the decoded page text, or '' on any request error."""
    try:
        r = requests.get(url, timeout=10)  # timeout so a dead server can't hang us
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
    # and hid programming errors; catch only network/HTTP failures.
    except requests.RequestException:
        return ""


def parserText(text, book_list):
    """Parse one Top 250 listing page and append one dict per book to *book_list*.

    Follows the "next page" link recursively; depth is bounded by the small,
    fixed number of listing pages. Returns *book_list*.
    """
    soup = BeautifulSoup(text, 'html.parser')
    for table in soup('table', {'width': '100%'}):
        if not isinstance(table, bs4.element.Tag):
            continue
        tds = table.find('tr')('td')
        content = {}
        for div in tds[1]('div'):
            if not isinstance(div, bs4.element.Tag):
                continue
            if div.find('a'):
                content.update({"Title": div.find('a').attrs['title']})
            if div.select('.rating_nums'):
                content.update({"score": div.select('.rating_nums')[0].text})
            if div.select('.pl'):
                people_num = div.select('.pl')[0].text
                content.update({"Number of people assessed": _NUM_RE.findall(people_num)[0]})
        for p in tds[1]('p'):
            if isinstance(p, bs4.element.Tag):
                if p.attrs['class'][0] == 'quote':
                    content.update({"introduce": p.find('span').string})
                if p.attrs['class'][0] == 'pl':
                    content.update({"Author information": p.string})
        book_list.append(content)
    next_books = soup.find('span', {'class': 'next'})
    # BUG FIX: the last page has no `span.next` element at all, so the old
    # unconditional `next_books.find('a')` raised AttributeError on None.
    if next_books is not None and next_books.find('a'):
        a = next_books.find('a').attrs['href']
        # BUG FIX: recurse into the SAME list we were given, not the global
        # `books`, so the function works with any caller-supplied list.
        parserText(getHtmlText(a), book_list)
    return book_list


def sortedBookTop250(book_list):
    """Return *book_list* sorted by score (descending) with 1-based rankings added.

    BUG FIX: the old `key=itemgetter('score')` compared scores as strings
    (lexicographic, so '10.0' sorted below '9.6') and raised KeyError for a
    book missing a score. Compare numerically and default missing scores to 0.
    """
    tmp = sorted(book_list, key=lambda b: float(b.get('score', 0)), reverse=True)
    for i in range(len(tmp)):
        tmp[i].update({"ranking": i + 1})
    return tmp


def writeToFile(book_list):
    """Write every book's key/value pairs to good_books.txt (UTF-8), one blank line between books."""
    with open('good_books.txt', 'w', encoding='utf8') as book_file:
        for book in book_list:
            for key, value in book.items():
                book_file.write(f'{key}:{value}\n')
            book_file.write('\n')


def main():
    text = getHtmlText(seed_url)
    book_list = parserText(text, books)
    writeToFile(sortedBookTop250(book_list))


if __name__ == '__main__':
    seed_url = "https://book.douban.com/top250"
    books = []
    main()
Summary
> - Click to visit my GitHub
> - Click to view my personal blog
The code above is posted as-is — two very simple scripts, built mainly on the requests and BeautifulSoup libraries. Feel free to take them and use them directly, or go straight to my GitHub to grab the generated movie.txt and good_books.txt files.