Web crawler notes 2: garbled Chinese text (mojibake) and other issues

1. Fixing garbled Chinese text

(1) Check whether the page data is loaded dynamically.

(2) Fetch the page source data.

Target site (pic.netbian.com):

First page address: http://pic.netbian.com/4kmeinv/

Second page: http://pic.netbian.com/4kmeinv/index_2.html

Third page: http://pic.netbian.com/4kmeinv/index_3.html

#Step 1: the code below has a bug: the Chinese text it returns is garbled

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page=int(input('start page num:'))
end_page=int(input('end page num:'))
#General url template (template cannot be modified)
url='http://pic.netbian.com/4kmeinv/index_%d.html'
# NOTE: range() excludes end_page, so the last page fetched is end_page-1.
for page in range(start_page,end_page):
    # Page 1 is served from the section root; later pages use index_N.html.
    if page==1:
        new_url='http://pic.netbian.com/4kmeinv/'
    else:
        new_url=url%page
    # Known issue (the subject of this step): .text is decoded with whatever
    # encoding requests selects, and the Chinese image names print garbled.
    page_text=requests.get(url=new_url,headers=headers).text
    #Resolve the src attribute value of name and picture
    tree=etree.HTML(page_text)
    # Each <li> under div.slist holds one thumbnail.
    li_list=tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        # alt carries the image name, src the thumbnail address.
        img_name=li.xpath('./a/img/@alt')[0]
        img_src=li.xpath('./a/img/@src')[0]
        print(img_name,img_src)

#Step 2: set the response encoding explicitly. The output changes, but it is still garbled

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page=int(input('start page num:'))
end_page=int(input('end page num:'))
#General url template (template cannot be modified)
url='http://pic.netbian.com/4kmeinv/index_%d.html'
# NOTE: range() excludes end_page, so the last page fetched is end_page-1.
for page in range(start_page,end_page):
    # Page 1 is served from the section root; later pages use index_N.html.
    if page==1:
        new_url='http://pic.netbian.com/4kmeinv/'
    else:
        new_url=url%page
    response=requests.get(url=new_url,headers=headers)
    # Force UTF-8 decoding of the body. This changes the output but does not
    # fix it (the subject of this step); presumably the page is not UTF-8,
    # since the next step decodes the names as GBK.
    response.encoding='utf-8'
    page_text=response.text
    #Resolve the src attribute value of name and picture
    tree=etree.HTML(page_text)
    # Each <li> under div.slist holds one thumbnail.
    li_list=tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        # alt carries the image name, src the thumbnail address.
        img_name=li.xpath('./a/img/@alt')[0]
        img_src=li.xpath('./a/img/@src')[0]
        print(img_name,img_src)

#Step 3: go further and repair each garbled name by re-encoding it as ISO-8859-1 and decoding it as GBK

import requests
from lxml import etree
# Browser-like User-Agent sent with every request.
headers={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
start_page=int(input('start page num:'))
end_page=int(input('end page num:'))
#General url template (template cannot be modified)
url='http://pic.netbian.com/4kmeinv/index_%d.html'
# NOTE: range() excludes end_page, so the last page fetched is end_page-1.
for page in range(start_page,end_page):
    # Page 1 is served from the section root; later pages use index_N.html.
    if page==1:
        new_url='http://pic.netbian.com/4kmeinv/'
    else:
        new_url=url%page
    response=requests.get(url=new_url,headers=headers)
    # The previous step's attempt, left disabled: forcing UTF-8 on the whole
    # response did not produce readable names.
    # response.encoding='utf-8'
    page_text=response.text
    #Resolve the src attribute value of name and picture
    tree=etree.HTML(page_text)
    # Each <li> under div.slist holds one thumbnail.
    li_list=tree.xpath('//div[@class="slist"]/ul/li')
    for li in li_list:
        img_name=li.xpath('./a/img/@alt')[0]
        # Undo the wrong decode for this one string: map the characters back
        # to their original bytes via ISO-8859-1, then decode those bytes as
        # GBK. Assumes requests decoded the body as ISO-8859-1 and the page
        # itself is GBK-encoded -- TODO confirm against the response headers.
        img_name=img_name.encode('iso-8859-1').decode('gbk')
        img_src=li.xpath('./a/img/@src')[0]
        print(img_name,img_src)

Step 4: further upgrade

Keywords: PHP Attribute Windows encoding network

Added by BenGilbert on Sun, 03 Nov 2019 17:38:17 +0200