Web crawler tutorial - urllib crawler - basic usage - timeout setting - automatically simulating HTTP requests

Writing simple crawlers with urllib, the library that comes with Python

urlopen() fetches the HTML source of a URL
read() reads the HTML source content
decode("utf-8") converts the bytes to a string

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html').read().decode("utf-8")
print(html)
<!DOCTYPE html>
<html>
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="csrf-param" content="_csrf">
    <meta name="csrf-token" content="X1pZZnpKWnQAIGkLFisPFT4jLlJNIWMHHWM6HBBnbiwPbz4/LH1pWQ==">
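
The same fetch can also be written with a context manager so the connection is closed promptly; a minimal variant of the example above:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request

# Same fetch as above, but the with-block closes the connection automatically
with urllib.request.urlopen('http://edu.51cto.com/course/8360.html') as resp:
    html = resp.read().decode("utf-8")
print(html[:200])    #Print only the first 200 characters of the source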

Extracting specific page content with a regular expression

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
import re
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html').read().decode("utf-8")  #Get the HTML source
pat = r"51CTO college Python Actual battle group\((\d*?)\)"      #Regex to capture the QQ group number
rst = re.compile(pat).findall(html)
print(rst)

#['325935753']
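
A minimal sketch of the same idea with re.search, which returns None when the pattern is missing, so a failed match can be handled explicitly (the sample text here is hard-coded for illustration):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import re

html = "... 51CTO college Python Actual battle group(325935753) ..."   #Hard-coded sample text
m = re.search(r"group\((\d+)\)", html)      #search() returns None when there is no match
if m:
    print(m.group(1))   #325935753
else:
    print("pattern not found")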

urlretrieve() downloads a network file and saves it locally; parameter 1 is the file's URL, parameter 2 the save path

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
import re
import os

file_path = os.path.join(os.getcwd(), '222.html')    #Build the file save path
# print(file_path)
request.urlretrieve('http://edu.51cto.com/course/8360.html', file_path) #Download this file and save it to the specified path
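
urlretrieve() also accepts an optional third argument, a reporthook callback invoked as (blocks transferred so far, block size, total size), which can be used to report download progress; a sketch:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
import os

def progress(block_num, block_size, total_size):
    done = block_num * block_size
    if total_size > 0:
        print("downloaded %d of %d bytes" % (min(done, total_size), total_size))
    else:
        print("downloaded %d bytes so far" % done)   #Total size unknown (e.g. chunked transfer)

file_path = os.path.join(os.getcwd(), '222.html')
request.urlretrieve('http://edu.51cto.com/course/8360.html', file_path, progress)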

urlcleanup() cleans up the temporary files and cache left behind by urlretrieve()

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request
import re
import os

file_path = os.path.join(os.getcwd(), '222.html')    #Build the file save path
# print(file_path)
request.urlretrieve('http://edu.51cto.com/course/8360.html', file_path) #Download this file and save it to the specified path
request.urlcleanup()
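
When no save path is passed, urlretrieve() writes the download to a temporary file and returns its path; urlcleanup() is what removes those temporary files:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
from urllib import request

tmp_path, headers = request.urlretrieve('http://edu.51cto.com/course/8360.html')  #No save path: a temp file is used
print(tmp_path)         #Path of the temporary file
request.urlcleanup()    #Remove the temporary file(s)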

info() shows the response headers of the fetched page

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html') #Open the URL; the response object exposes info()
a = html.info()
print(a)

# Date: Tue, 25 Jul 2017 16:08:17 GMT
# Content-Type: text/html; charset=UTF-8
# Transfer-Encoding: chunked
# Connection: close
# Set-Cookie: aliyungf_tc=AQAAALB8CzAikwwA9aReq63oa31pNIez; Path=/; HttpOnly
# Server: Tengine
# Vary: Accept-Encoding
# Vary: Accept-Encoding
# Vary: Accept-Encoding
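
The object returned by info() behaves like an email.message.Message, so individual headers can be read with get():

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request

html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html')
headers = html.info()
print(headers.get('Content-Type'))   #e.g. text/html; charset=UTF-8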

getcode() gets the HTTP status code of the response

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html') #Open the URL
a = html.getcode()  #Get Status Code
print(a)

#200
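
Note that urlopen() raises urllib.error.HTTPError for non-2xx responses, so the status code of a failed request is read from the exception instead; the 404 URL below is hypothetical:

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://edu.51cto.com/no-such-page')  #Hypothetical URL, assumed to return 404
    print(html.getcode())
except urllib.error.HTTPError as e:
    print(e.code)    #Status code of the error response, e.g. 404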

geturl() gets the URL of the page that was actually fetched (useful after a redirect)

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html') #Open the URL
a = html.geturl()  #Get the URL of the currently captured page
print(a)

#http://edu.51cto.com/course/8360.html

timeout sets the fetch timeout, in seconds

Set a timeout so that when the remote server responds too slowly, or not at all, the fetch is abandoned once the timeout expires instead of hanging indefinitely.

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import urllib.request
html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html', timeout=30) #Give up if there is no response within 30 seconds
a = html.geturl()  #Get the URL of the currently captured page
print(a)

#http://edu.51cto.com/course/8360.html
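
If the timeout expires, urlopen() raises an exception rather than returning. A sketch that deliberately uses a very short timeout so the failure path is likely to run (depending on the Python version, the error surfaces as socket.timeout or wrapped in urllib.error.URLError):

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import socket
import urllib.request
import urllib.error

try:
    html = urllib.request.urlopen('http://edu.51cto.com/course/8360.html', timeout=0.01)
    print(html.geturl())
except socket.timeout:
    print("request timed out")
except urllib.error.URLError as e:
    print("request failed:", e.reason)   #A timeout may also surface here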

Automatically simulating HTTP requests

HTTP requests are usually GET or POST requests

GET request

Take 360 Search as an example: it fetches results with a GET request that passes the user's search keyword to the server.

So we can simulate that HTTP request and construct the keyword query automatically.

quote() percent-encodes the keyword into characters a URL can carry; by default a URL cannot contain Chinese characters

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse
import re
gjc = "Mobile phone"     #Set the keyword
gjc = urllib.parse.quote(gjc)           #Percent-encode the keyword; a URL cannot carry raw Chinese characters
url = "https://www.so.com/s?q=" + gjc   #Construct the url address
# print(url)
html = urllib.request.urlopen(url).read().decode("utf-8")  #Get the html source
pat = r"(\w*<em>\w*</em>\w*)"           #Regex to capture the related titles
rst = re.compile(pat).findall(html)
# print(rst)
for i in rst:
    print(i)                            #Print each matched title

# Official<em>mobile phone</em>
# Official<em>mobile phone</em>
# Official<em>mobile phone</em> at such a low price
# Big <em>mobile phone</em> at a low price
# <em>mobile phone</em>
# Taobao recommends <em>mobile phone</em>
# <em>mobile phone</em>
# <em>mobile phone</em>
# <em>mobile phone</em>
# <em>mobile phone</em>
# Suning Yigou <em>mobile phone</em>
# Buy <em>mobile phone</em>
# Buy <em>mobile phone</em>
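
For GET requests with several parameters, urllib.parse.urlencode() builds the whole query string and handles the percent-encoding that quote() did above; a sketch:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse

params = urllib.parse.urlencode({'q': 'Mobile phone'})   #Encodes the space and any non-ASCII characters
url = "https://www.so.com/s?" + params
html = urllib.request.urlopen(url).read().decode("utf-8")
print(len(html))    #Length of the fetched source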

POST request

urlencode() encodes the form data for a POST request; its parameter is a dictionary of key-value form fields
Request() builds the POST request; parameter 1 is the url address, parameter 2 the encoded form data

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import urllib.request
import urllib.parse

posturl = "http://www.iqianyue.com/mypost/"
shuju = urllib.parse.urlencode({                #Encode the form data: a dictionary of key-value form fields
    'name': '123',
    'pass': '456'
    }).encode('utf-8')
req = urllib.request.Request(posturl, shuju)    #Build the POST request: url address plus the encoded form data
html = urllib.request.urlopen(req).read().decode("utf-8")  #Submit the request and read the returned page
print(html)
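
Many sites reject urllib's default User-Agent, so crawlers often pass request headers as the third argument to Request(); the header value below is an assumed browser-style string, for illustration only:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse

posturl = "http://www.iqianyue.com/mypost/"
shuju = urllib.parse.urlencode({'name': '123', 'pass': '456'}).encode('utf-8')
headers = {'User-Agent': 'Mozilla/5.0'}                 #Assumed value, for illustration only
req = urllib.request.Request(posturl, shuju, headers)   #url, form data, request headers
html = urllib.request.urlopen(req).read().decode("utf-8")
print(html)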
