Douban Movie Top 250 and Book Top 250 Crawlers
Recently I started playing with Python. While learning about web crawlers, on a whim I scraped Douban's Top 250 movies and Top 250 books. Here I record the process.
Movie Top 250 Crawler
import requests
from bs4 import BeautifulSoup
import time


def getlist(list_url):
    """Crawl the Douban movie Top 250 listing starting at *list_url*.

    Walks the "next page" links iteratively (the original recursed once per
    page, growing the call stack for no benefit) and appends one formatted
    text record per movie to the module-level ``movie`` list.

    Returns the module-level ``movie`` list with all collected records.
    """
    next_url = list_url
    while next_url:
        time.sleep(2)  # be polite: throttle so we don't hammer the site
        res = requests.get(next_url, timeout=10)  # timeout so a stalled server can't hang us forever
        soup = BeautifulSoup(res.text, 'html.parser')
        for m in soup.select('.grid_view li'):
            rank = m.select('em')[0].text
            score = m.select('.rating_num')[0].text
            title = m.select('.title')[0].text
            direct = m.select('.info .bd p')[0].text.strip()
            # Re-join the director/cast line so each field lands on its own
            # line in the output record.
            actor = '\n To star:'.join(direct.split(' To star:'))
            director = 'years:'.join(actor.split(' '))
            # Not every entry carries a one-line quote (.inq).
            if m.select('.inq'):
                comments = m.select('.inq')[0].text.strip()
            else:
                comments = 'None'
            movie.append(
                'ranking: ' + rank + '\n'
                + 'score: ' + score + '\n'
                + 'title: ' + title + '\n'
                + director + '\n'
                + 'comment: ' + comments + '\n' + '\n')
        # The "next" link is relative; prefix the seed URL to follow it.
        next_link = soup.select('.next a')
        if next_link:
            next_url = seed_url + next_link[0]['href']
        else:
            print('End')
            next_url = None
    return movie


def write(movies):
    """Dump the collected movie records to movie.txt (UTF-8)."""
    with open('movie.txt', 'w', encoding='utf8') as m:
        m.writelines(movies)


def main():
    write(getlist(seed_url))


if __name__ == '__main__':
    seed_url = 'https://movie.douban.com/top250'
    movie = []
    main()
Book Top 250 Crawler
import bs4
import requests
import re
from bs4 import BeautifulSoup
from operator import itemgetter

# Digits-only pattern used to pull the rating count out of e.g. "(123456 ...)".
# Compiled once at module level instead of inside the per-book loop.
_NUM_RE = re.compile(r'[\d]{1,10}')


def getHtmlText(url):
    """Fetch *url* and return the decoded page text, or '' on any request error."""
    try:
        r = requests.get(url, timeout=10)  # timeout so a dead server can't hang us
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    # BUG FIX: a bare `except:` also swallowed KeyboardInterrupt/SystemExit
    # and hid programming errors; catch only network/HTTP failures.
    except requests.RequestException:
        return ""


def parserText(text, book_list):
    """Parse one Top 250 listing page and append one dict per book to *book_list*.

    Follows the "next page" link recursively; depth is bounded by the small,
    fixed number of listing pages. Returns *book_list*.
    """
    soup = BeautifulSoup(text, 'html.parser')
    for table in soup('table', {'width': '100%'}):
        if not isinstance(table, bs4.element.Tag):
            continue
        tds = table.find('tr')('td')
        content = {}
        for div in tds[1]('div'):
            if not isinstance(div, bs4.element.Tag):
                continue
            if div.find('a'):
                content.update({"Title": div.find('a').attrs['title']})
            if div.select('.rating_nums'):
                content.update({"score": div.select('.rating_nums')[0].text})
            if div.select('.pl'):
                people_num = div.select('.pl')[0].text
                content.update({"Number of people assessed": _NUM_RE.findall(people_num)[0]})
        for p in tds[1]('p'):
            if isinstance(p, bs4.element.Tag):
                if p.attrs['class'][0] == 'quote':
                    content.update({"introduce": p.find('span').string})
                if p.attrs['class'][0] == 'pl':
                    content.update({"Author information": p.string})
        book_list.append(content)
    next_books = soup.find('span', {'class': 'next'})
    # BUG FIX: the last page has no `span.next` element at all, so the old
    # unconditional `next_books.find('a')` raised AttributeError on None.
    if next_books is not None and next_books.find('a'):
        a = next_books.find('a').attrs['href']
        # BUG FIX: recurse into the SAME list we were given, not the global
        # `books`, so the function works with any caller-supplied list.
        parserText(getHtmlText(a), book_list)
    return book_list


def sortedBookTop250(book_list):
    """Return *book_list* sorted by score (descending) with 1-based rankings added.

    BUG FIX: the old `key=itemgetter('score')` compared scores as strings
    (lexicographic, so '10.0' sorted below '9.6') and raised KeyError for a
    book missing a score. Compare numerically and default missing scores to 0.
    """
    tmp = sorted(book_list, key=lambda b: float(b.get('score', 0)), reverse=True)
    for i in range(len(tmp)):
        tmp[i].update({"ranking": i + 1})
    return tmp


def writeToFile(book_list):
    """Write every book's key/value pairs to good_books.txt (UTF-8), one blank line between books."""
    with open('good_books.txt', 'w', encoding='utf8') as book_file:
        for book in book_list:
            for key, value in book.items():
                book_file.write(f'{key}:{value}\n')
            book_file.write('\n')


def main():
    text = getHtmlText(seed_url)
    book_list = parserText(text, books)
    writeToFile(sortedBookTop250(book_list))


if __name__ == '__main__':
    seed_url = "https://book.douban.com/top250"
    books = []
    main()
Summary
> - Click to visit my GitHub
> - Click to view my personal blog
The code above is posted as-is — two very simple scripts, built mainly on the requests and BeautifulSoup libraries. Feel free to take them and use them directly, or go straight to my GitHub to grab the generated movie.txt and good_books.txt files.