Python learning, day 7: after-class exercises (crawling and analyzing website text, crawling pictures)

Crawler for text

1. Crawl the article titles on the NetEase homepage, then run word-frequency and word-cloud analysis on them

Program code:
# Import module package
import requests
import re
import jieba
import wordcloud

# Page to crawl: the NetEase portal homepage
url = 'https://www.163.com/'
# Fetch the page. The timeout keeps the script from hanging forever on a
# stalled connection, and raise_for_status() stops us from parsing an
# HTTP error page as if it were the homepage.
response = requests.get(url, timeout=10)
response.raise_for_status()
data = response.text
# Extract the link text of every news.163.com article anchor that directly
# follows an <li ...> tag. The pattern is a raw string and the dots in the
# host name and before "html" are escaped so they match a literal "." only.
title_res = re.findall(r'<li.*?><a href="https://news\.163\.com/.*?\.html">(.*?)</a>', data)
# Concatenate all crawled titles into one string for word segmentation
str_data = ''.join(title_res)

# Word-frequency count
# Maps each kept word to the number of times it occurs
data_dict = {}
# Segment the combined titles into a list of words with jieba
str_lcut = jieba.lcut(str_data)
for word in str_lcut:
    # Only tokens of two or more characters are counted; this also drops
    # single-character punctuation such as ',', ' ' and ':'
    if len(word) >= 2:
        data_dict[word] = data_dict.get(word, 0) + 1
# Sort key used below: picks the count out of a (word, count) pair
def find_data(i):
    """Return the element at index 1 of ``i``."""
    second = i[1]
    return second
# Turn the word -> count mapping into a list of (word, count) pairs,
# ordered from the most frequent word to the least frequent
list_data = sorted(data_dict.items(), key=find_data, reverse=True)
# Print the twenty most frequent words; the slice bound is 20 (not 21)
# so that exactly twenty rows are shown
for word, count in list_data[:20]:
    # Word left-aligned in 10 columns, count centered in 5 columns
    print(f'{word:<10} {count:^5}')

# Build the word cloud
# Text fed to the word cloud: every counted word followed by a single space
cloud_data = "".join(pair[0] + " " for pair in list_data)
# Configure the canvas: white background, 1000x800 pixels, and an explicit
# font file (presumably one that contains Chinese glyphs - confirm the path
# exists on the machine running the script)
w = wordcloud.WordCloud(
    background_color="white",
    font_path=r"C:\Windows\Fonts\FZLTCXHJW.TTF",
    width=1000,
    height=800,
)
# Lay the words out on the canvas
w.generate(cloud_data)
# Save the rendered word cloud as an image file
w.to_file('pc.jpg')
Operation result:

Xi Da 6
United Arab Emirates 4
Abu Dhabi 4
Crown Prince 4
Increase 4
em 4
Blower 4
High speed 3
Hold ceremony 2
Welcome 2
Visiting China 2
Hold talks 2
About 2
Story 2
The first half 2
Our country 2
Investment 2
Structure 2
Last 2
Optimization 2
Power 2

Word cloud produced:

Crawling cases for pictures

2. Crawl the pictures from a website (doutula.com)

Program code:
# Step 1 import module package
import requests
import re
import time

# Crawl pictures from doutula.com list pages and save them under ./images
def get_images(type_num, nums):
    """Download every picture found on the first ``nums`` list pages of a category.

    Args:
        type_num: Category selector as a string: '1' -> 'article',
            '2' -> 'photo', '3' -> 'zz'.
        nums: Number of list pages to crawl, starting at page 1; any value
            accepted by ``int()``.

    Raises:
        ValueError: If ``type_num`` is not a known selector, or ``nums``
            cannot be converted to an integer.
        requests.HTTPError: If a list page answers with an error status.
    """
    import os  # local import: the script's import section does not provide os

    # Map the menu choice to the URL path segment; reject anything else
    # instead of silently crawling a malformed URL
    categories = {'1': 'article', '2': 'photo', '3': 'zz'}
    if type_num not in categories:
        raise ValueError(f'unknown picture type: {type_num!r} (expected 1, 2 or 3)')
    img_type = categories[type_num]
    page_count = int(nums)

    # Make sure the output directory exists before the first write
    os.makedirs('images', exist_ok=True)

    for i in range(page_count):
        # List pages are numbered from 1
        url = f'https://www.doutula.com/{img_type}/list/?page={i+1}'
        # The timeout prevents an unresponsive server from hanging the crawl
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        page_code = response.text

        # Picture URLs are carried in the data-original attribute
        imgs_url = re.findall('data-original="(.*?)"', page_code)
        for img_url in imgs_url:
            img_res = requests.get(img_url, timeout=10)
            if not img_res.ok:
                # Do not save an HTTP error page under a picture file name
                print(f'Skip {img_url}: HTTP {img_res.status_code}')
                continue
            # File name: the last 10 characters of the final URL path segment
            img_name = img_url.split('/')[-1][-10:]
            # os.path.join keeps the path portable across operating systems
            with open(os.path.join('images', img_name), 'wb') as f:
                f.write(img_res.content)
            print(f'Save the first{i+1}Page, picture{img_name}Success')
            # Short pause between downloads, taken after the file is closed
            time.sleep(0.1)
# Program entry: show the banner, ask what to crawl, then start downloading
start = 'Expression bag acquisition system----If you have this skill, you will never overturn'
# Center the banner in a 50-column field padded with '*'
print(format(start, '*^50'))
# Both answers are passed on as the raw strings typed by the user
type_num = input("Please select the type of picture to crawl (1.Picture expression pack, 2.The latest expression pack, 3.Emoticon bag template):")
nums = input("Please enter the number of pages to crawl:")
get_images(type_num=type_num, nums=nums)
Operation result:

*************Emoticon bag acquisition system - with this skill, you will never overturn from this fight map**************
Please select the type of picture you want to crawl (1. Picture expression pack, 2. The latest expression pack, 3. Expression pack template): 2
Please enter the number of pages to crawl: 1
Save page 1, picture 095t8x.jpg succeeded
Save page 1, picture 06odfz.jpg succeeded
Save page 1, picture 05074f.jpg succeeded
Save page 1, picture 051q2w.gif succeeded
Save page 1, picture 0k0mxg.jpg succeeded
Save page 1, picture 08cweg.jpg succeeded
Save page 1, picture 0k0wes.jpg succeeded
Save page 1, picture 034a9v.jpg succeeded
Save page 1, picture 05k3yz.gif succeeded

Keywords: PHP Windows

Added by mosizlak on Sat, 19 Oct 2019 00:09:56 +0300