Use Requests and Beautiful Soup to crawl an image-gallery site

Project Description:

A Python novice's first crawler project. Writing a web crawler makes learning Python more engaging, so that studying the language is no longer boring.

Python version 3.7.2

Modules needed: requests, os, beautifulsoup4 (bs4)

Crawler destination https://www.mzitu.com/xinggan/

Project realization:

First, import the modules and configure the request headers; without the proper headers the site may return empty image links.

# -*- coding:utf-8 -*-
import requests,os
from bs4 import BeautifulSoup
# Root directory where all downloaded albums are stored
Path = 'D:\\MeiZ\\'
# Browser-like User-Agent plus a Referer; NOTE(review): the Referer is
# presumably required by the site's hotlink protection -- confirm
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
          'Referer': 'https://www.mzitu.com/163367'}

 

1. Get the source code of the web page

def getContent(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup tree."""
    ## requests.get returns a Response object
    response = requests.get(url, headers=header)
    ## The lxml parser is faster than html5lib
    return BeautifulSoup(response.text, 'lxml')

2. Get the album addresses

Right-click an image in Chrome and choose "Inspect".
There are too many <a> tags to search the whole document for them, so find the <img> tags instead and reach the enclosing <a> tag through its parent.
def get_album_list(url):
    """Return the album-page URLs found on one list page."""
    soup = getContent(url)
    ## Each lazy-loaded <img> sits inside an <a> tag; the parent's
    ## href attribute is the album address
    return [img.parent['href'] for img in soup.findAll('img', {'class': 'lazy'})]

3. Get the title and page count of an album

 

Open an album and use CSS selectors to extract the title and the page count. Inspecting the title shows its class name is "main-title".

Search page number with span label

def getPage_Title(url):
    """Return (page count, title) for the album at *url*."""
    soup = getContent(url)
    ## select() returns a list; the first match is the album title
    title_tag = soup.select('.main-title')[0]
    ## NOTE(review): assumes the page count is the 11th <span> on the
    ## page -- fragile if the site layout ever changes
    page_tag = soup.select('span')[10]
    return page_tag.text, title_tag.text

4. Get the image address

 

Check the image. The image link is in src of img tag

def getImg_url(url):
    """Return the src attribute of the first <img> on a picture page."""
    ## find(name, attrs, recursive, text, **kwargs) locates a single tag
    document = getContent(url)
    return document.find('img')['src']

5. Download the pictures

def down_Img(url,title,count):
    """Download one picture to Path/<title>/<count>.jpg.

    url   -- direct image URL
    title -- album title, used as the sub-directory name
    count -- picture number inside the album, used as the file name
    """
    try:
        ## The request itself can fail, so it belongs inside the try
        res = requests.get(url, headers=header)
        ## 'with' closes the file automatically -- no explicit close() needed
        with open(os.path.join(Path, title, str(count) + '.jpg'), 'wb') as file:
            file.write(res.content)
    except Exception as e:
        ## Report what actually went wrong instead of swallowing it
        print("ERROR: " + str(e))

6. Main script: chaining the methods together

## Ensure the root download directory (D:\MeiZ\) exists before starting
if not os.path.isdir(Path):
    os.mkdir(Path)

The album list address is https://www.mzitu.com/xinggan/page/ followed by the page number; there are 135 pages in total.

base_url = 'https://www.mzitu.com/xinggan/page/'
## Loop over every list page; 135 pages in total, range end is exclusive
for t in range(1, 136):
    ## Bug fix: build each page URL from the fixed base. The original
    ## `url = url + str(t)` accumulated page numbers across iterations
    ## (".../page/1", ".../page/12", ".../page/123", ...)
    url = base_url + str(t)
    ## Get the album addresses on this list page
    album_list = get_album_list(url)
    for index in range(0, len(album_list)):
        ## Fetch the album's page count and title
        page_num, title = getPage_Title(album_list[index])
        print("Downloading:" + title)
        ## Create a sub-directory named after the title; skip if it exists
        if not os.path.isdir(Path + title):
            os.mkdir(Path + title)
            ## Pictures are numbered 1..page_num; range end is exclusive,
            ## so +1 keeps the last picture from being skipped
            for i in range(1, int(page_num) + 1):
                ## Each picture page lives at <album url>/<page number>
                Page_url = album_list[index] + "/" + str(i)
                Img_Info = getImg_url(Page_url)
                down_Img(Img_Info, title, i)
                print("Downloading" + str(i) + "/" + str(page_num))
        else:
            print("Downloaded skipped")

7. Complete code

# -*- coding:utf-8 -*-
#__author__ = 'vic'

import requests,os
from bs4 import BeautifulSoup
# Root directory for all downloaded albums
Path = 'D:\\MeiZ\\'
# Browser-like User-Agent plus a Referer; NOTE(review): the Referer is
# presumably needed to pass the site's hotlink protection -- confirm
header = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36',
          'Referer': 'https://www.mzitu.com/163367'}

##Web page html
def getContent(url):
    """Download *url* and parse the HTML into a BeautifulSoup document."""
    html = requests.get(url, headers=header).text
    ## lxml is the fastest of the supported parsers
    return BeautifulSoup(html, 'lxml')

##Get atlas address
def get_album_list(url):
    """Collect the album URLs listed on one overview page."""
    soup = getContent(url)
    ## Every album thumbnail is a lazy-loaded <img> wrapped in an <a>;
    ## the parent's href is the album address
    lazy_imgs = soup.findAll('img', {'class': 'lazy'})
    return [tag.parent['href'] for tag in lazy_imgs]

##Get page number of atlas
def getPage_Title(url):
    """Return (page count, title) of the album at *url*."""
    soup = getContent(url)
    ## NOTE(review): assumes the 11th <span> holds the page count --
    ## breaks silently if the page layout changes
    page_count = soup.select('span')[10].text
    album_title = soup.select('.main-title')[0].text
    return page_count, album_title

##Get picture address
def getImg_url(url):
    """Return the image address (src of the first <img>) on a picture page."""
    page = getContent(url)
    return page.find('img')['src']

##download
def down_Img(url,title,count):
    """Download one picture to Path/<title>/<count>.jpg.

    url   -- direct image URL
    title -- album title, used as the sub-directory name
    count -- picture number inside the album, used as the file name
    """
    try:
        ## The request itself can fail, so it belongs inside the try
        res = requests.get(url, headers=header)
        ## 'with' closes the file automatically -- no explicit close() needed
        with open(os.path.join(Path, title, str(count) + '.jpg'), 'wb') as file:
            file.write(res.content)
    except Exception as e:
        ## Report what actually went wrong instead of swallowing it
        print("ERROR: " + str(e))

if __name__ == '__main__':
    ## Make sure the root download directory exists
    if not os.path.isdir(Path):
        os.mkdir(Path)
    base_url = 'https://www.mzitu.com/xinggan/page/'
    ## 135 list pages in total; range end is exclusive
    for t in range(1, 136):
        ## Bug fix: build each page URL from the fixed base. The original
        ## `url = url + str(t)` appended page numbers to the previous URL
        ## (".../page/1", ".../page/12", ".../page/123", ...)
        url = base_url + str(t)
        album_list = get_album_list(url)
        for index in range(0, len(album_list)):
            page_num, title = getPage_Title(album_list[index])
            print("Downloading:" + title)
            ## One sub-directory per album; skip albums already downloaded
            if not os.path.isdir(Path + title):
                os.mkdir(Path + title)
                ## +1 so the final picture (page_num) is not skipped
                for i in range(1, int(page_num) + 1):
                    Page_url = album_list[index] + "/" + str(i)
                    Img_Info = getImg_url(Page_url)
                    down_Img(Img_Info, title, i)
                    print("Downloading" + str(i) + "/" + str(page_num))
            else:
                print(title + "Downloaded skipped")

 

8. Project results

Now to plan the next crawler project.

Keywords: Python Windows Programming

Added by satheshf12000 on Sun, 08 Dec 2019 06:52:26 +0200