Crawler for text
1. Crawl the article titles on the NetEase homepage, then run word-frequency and word-cloud analysis on them.
Program code:
# Import module packages
import re
from collections import Counter

import jieba
import requests
import wordcloud

# Home page whose news headlines are crawled
url = 'https://www.163.com/'

# Fetch the page. The timeout keeps the script from hanging on a stalled
# connection, and raise_for_status() stops it from silently analysing an
# HTTP error page as if it were the home page.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.text

# Extract the link text of every news article. Raw string with escaped dots so
# that "." only matches a literal dot in the host name and the ".html" suffix.
title_res = re.findall(
    r'<li.*?><a href="https://news\.163\.com/.*?\.html">(.*?)</a>', data
)

# Join all crawled titles into one string so jieba can segment it in one pass
str_data = ''.join(title_res)

# Word frequency count
# Cut the text into words first
str_lcut = jieba.lcut(str_data)

# Count every word of two or more characters; single characters (which
# includes the punctuation marks ',', ' ' and ':') are treated as noise.
data_dict = Counter(word for word in str_lcut if len(word) >= 2)

# (word, count) pairs sorted by count, highest first; words with equal counts
# keep the order in which they were first seen.
list_data = data_dict.most_common()

# Print the most frequent words. The slice is [:21], so 21 rows are printed;
# this matches the recorded run shown under "Operation result".
for word, count in list_data[:21]:
    print(f'{word:<10} {count:^5}')

# Word cloud production
# The word cloud input is every counted word, separated by spaces
cloud_data = " ".join(word for word, _ in list_data)

# Create a word cloud canvas. font_path points at a font installed on the
# author's Windows machine; change it to a CJK-capable font on other systems.
w = wordcloud.WordCloud(
    background_color="white",
    font_path=r"C:\Windows\Fonts\FZLTCXHJW.TTF",
    width=1000,
    height=800,
)

# Render the words onto the canvas
w.generate(cloud_data)

# Save the finished word cloud image
w.to_file('pc.jpg')
Operation result:
Xi Da 6
United Arab Emirates 4
Abu Dhabi 4
Crown Prince 4
Increase 4
em 4
Blower 4
High speed 3
Hold ceremony 2
Welcome 2
Visiting China 2
Hold talks 2
About 2
Story 2
First half of the year 2
Our country 2
Investment 2
Structure 2
Last 2
Optimization 2
Power 2
Word cloud produced:
Picture crawling example
2. The second task is to crawl the pictures from a website
Program code:
# Step 1: import module packages
import os
import re
import time

import requests

# Maps the menu choice typed by the user to the URL path segment of the site
_IMG_TYPES = {'1': 'article', '2': 'photo', '3': 'zz'}

# Directory the downloaded pictures are written to
_IMAGE_DIR = 'images'


# Function that crawls the pictures
def get_images(type_num, nums):
    """Download every picture found on the first ``nums`` list pages.

    Args:
        type_num: Menu choice as a string: '1', '2' or '3'. It selects the
            'article', 'photo' or 'zz' section of the site respectively.
        nums: Number of list pages to crawl, starting at page 1. Anything
            accepted by ``int()`` works, e.g. the raw string from ``input()``.

    Raises:
        ValueError: If ``type_num`` is not a known choice or ``nums`` is not
            an integer.
        requests.RequestException: If a list page cannot be fetched.
    """
    img_type = _IMG_TYPES.get(str(type_num).strip())
    if img_type is None:
        # Without this check an unknown choice would build a malformed URL
        # ("https://www.doutula.com//list/...") and crawl the wrong page.
        raise ValueError(
            f'Unknown picture type {type_num!r}; expected one of '
            f'{", ".join(sorted(_IMG_TYPES))}'
        )
    page_count = int(nums)

    # open() does not create missing directories, so make sure it exists
    os.makedirs(_IMAGE_DIR, exist_ok=True)

    for i in range(page_count):
        # URL of the list page to crawl (pages are numbered from 1)
        url = f'https://www.doutula.com/{img_type}/list/?page={i+1}'
        # Fetch the page; the timeout avoids hanging on a stalled connection
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        # Source code of the web page
        page_code = response.text
        # The picture addresses are taken from the data-original attributes
        imgs_url = re.findall(r'data-original="(.*?)"', page_code)
        for img_url in imgs_url:
            # Download the picture
            img_res = requests.get(img_url, timeout=10)
            if not img_res.ok:
                # Do not save an HTTP error body as if it were a picture
                print(f'Skipped {img_url}: HTTP {img_res.status_code}')
                continue
            # File name: last 10 characters of the last URL path component
            img_name = img_url.split('/')[-1][-10:]
            # The with-block flushes and closes the file on exit
            with open(os.path.join(_IMAGE_DIR, img_name), 'wb') as f:
                f.write(img_res.content)
            # Short pause between downloads to go easy on the server
            time.sleep(0.1)
            print(f'Save the first{i+1}Page, picture{img_name}Success')


start = 'Expression bag acquisition system----If you have this skill, you will never overturn'
print(f'{start:*^50}')
type_num = input("Please select the type of picture to crawl (1.Picture expression pack, 2.The latest expression pack, 3.Emoticon bag template):")
nums = input("Please enter the number of pages to crawl:")
get_images(type_num, nums)
Operation result:
*************Emoticon bag acquisition system - with this skill, you will never overturn from this fight map**************
Please select the type of picture you want to crawl (1. Picture expression pack, 2. The latest expression pack, 3. Expression pack template): 2
Please enter the number of pages to crawl: 1
Save page 1, picture 095t8x.jpg succeeded
Save page 1, picture 06odfz.jpg succeeded
Save page 1, picture 05074f.jpg succeeded
Save page 1, picture 051q2w.gif succeeded
Save page 1, picture 0k0mxg.jpg succeeded
Save page 1, picture 08cweg.jpg succeeded
Save page 1, picture 0k0wes.jpg succeeded
Save page 1, picture 034a9v.jpg succeeded
Save page 1, picture 05k3yz.gif succeeded