Crawling the data behind a map display on a web page with Python

Recently I needed to crawl information from a logistics company's official website. At first glance it looked like plain static pages, nothing like e-commerce sites with their complex page structures, strict anti-crawling measures, and heavy use of AJAX, so I was quietly pleased. But when I analyzed it further, I found the pages are not ordinary static pages after all.
For example, on the page below I want to get the distribution of logistics parks across all the major cities in China, plus the details of each park.
The page embeds a map, and you have to click each city on the map to display its logistics information.
https://www.glprop.com.cn/our...

At first I suspected the data was loaded by an AJAX request, but I couldn't find one in Chrome DevTools. So I checked the page source instead and found all the city information sitting in a single script block (screenshot omitted).

The information for each park is likewise stored in a JavaScript variable of the form var parks = {...}.

So everything is already there: fetch the page source, match it with regular expressions, and parse away.
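
Before wiring this into Scrapy, here is a quick standalone sketch of the idea. The variable names var cities and var parks are what the page source shows; the plain requests call (and the assumption that no extra headers or cookies are needed) is mine:

import re
import json
import requests

url = 'https://www.glprop.com.cn/our-network/network-detail.html'
html = requests.get(url, timeout=10).text

# The page embeds its data as JavaScript variables, so pull them out with a regex.
cities_m = re.search('var cities =(.*);', html)
parks_m = re.search('var parks =(.*);', html)

if cities_m and parks_m:
    cities = json.loads(cities_m.group(1))  # list of dicts like {"id": ..., "name": ...}
    parks = json.loads(parks_m.group(1))    # nested dict of park records, keyed by city
    print(len(cities), 'cities found')
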
item:

# Newly added item classes
import scrapy

class PuluosiNewsItem(scrapy.Item):
    newstitle = scrapy.Field()
    newtiems = scrapy.Field()
    newslink = scrapy.Field()

class PuluosiItem(scrapy.Item):
    assetstitle = scrapy.Field()
    assetaddress = scrapy.Field()
    assetgaikuang = scrapy.Field()
    assetpeople = scrapy.Field()
    asseturl = scrapy.Field()

pipelines:

from openpyxl import Workbook

from news.items import PuluosiNewsItem, PuluosiItem


class PuluosiNewsPipeline(object):
    def __init__(self):
        # One workbook for news items and another for logistics-park (asset) items
        self.wb = Workbook()
        self.ws = self.wb.active
        # Write the header rows
        self.ws.append(['Puluosi news title', 'Release time', 'News URL'])
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['Asset title', 'Asset address', 'Asset profile', 'Other information', 'URL'])

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            line = [item['newstitle'], item['newtiems'], item['newslink']]  # flatten the item into one row
            self.ws.append(line)
            self.wb.save('PuluosiNews.xlsx')  # save the xlsx file
        elif isinstance(item, PuluosiItem):
            line = [item['assetstitle'], item['assetaddress'], item['assetgaikuang'], item['assetpeople'], item['asseturl']]
            self.ws2.append(line)
            self.wb2.save('PuluosiAsset.xlsx')  # save the xlsx file
        return item
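
For the pipeline to actually run, it has to be registered in settings.py. A minimal sketch, assuming the Scrapy project is named news, as the item imports suggest:

# settings.py
ITEM_PIPELINES = {
    'news.pipelines.PuluosiNewsPipeline': 300,  # lower number = runs earlier
}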

spider:

# -*- coding: utf-8 -*-
import scrapy,re,json
from news.items import PuluosiNewsItem,PuluosiItem
from scrapy.linkextractors import LinkExtractor

class PuluosiSpider(scrapy.Spider):
    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']
    # start_urls = ['https://www.glprop.com.cn/press-releases.html']

    def start_requests(self):
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def parse1(self, response):
        print('Spider started: puluosi')
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the table header row
        for node in web:
            item = PuluosiNewsItem()  # create a fresh item for every row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin turns the relative href from the page into an absolute URL
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            # print(item['newslink'])
            yield item
        # Use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(), "next page")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse1)
        except Exception:
            print("Current page has no next page")
        # Follow the links for the other years in the time list
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse1)

    def parse2(self, response):
        web = response.xpath('//tbody/tr')
        web.pop(0)  # drop the table header row
        for node in web:
            item = PuluosiNewsItem()  # create a fresh item for every row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin turns the relative href from the page into an absolute URL
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item
        # Use try to check whether the current year's news has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(), "next page")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse2)
        except Exception:
            print("Current page has no next page")
        # Follow the links for the other years in the time list
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse2)

    def parse3(self, response):
        web = response.xpath('//tbody/tr')
        web.pop()  # drop the last (non-data) row
        for node in web:
            item = PuluosiNewsItem()  # create a fresh item for every row
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin turns the relative href from the page into an absolute URL
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item

    def parse4(self,response):
        link=LinkExtractor(restrict_xpaths='//div[@class="net_pop1"]//div[@class="city"]')
        links=link.extract_links(response)
        #Get links for all cities
        for i in links:
            detailurl=i.url
            yield scrapy.Request(url=detailurl,callback=self.parse5)

    def parse5(self, response):
        # Build a mapping from city id to city name out of the "var cities" variable
        citycode = re.findall('var cities =(.*);', response.text)
        citycodejson = json.loads("".join(citycode))
        dictcity = {}
        for i in citycodejson:
            dictcity[i['id']] = i['name']
        # Flatten the nested "var parks" structure into a plain list of park records
        detail = re.findall('var parks =(.*);', response.text)
        jsonBody = json.loads("".join(detail))
        parks = []
        for key1 in jsonBody:
            for key2 in jsonBody[key1]:
                parks.append(jsonBody[key1][key2])
        for node in parks:
            item = PuluosiItem()  # create a fresh item for every park
            assetaddress = node['city_id']
            item['assetaddress'] = dictcity[assetaddress]
            item['assetstitle'] = node['name']
            item['assetgaikuang'] = node['detail_single'].strip().replace(' ', '')  # remove stray spaces
            assetpeople = node['description']
            item['assetpeople'] = re.sub(r'<.*?>', '', assetpeople.strip()).replace('&nbsp;', '')  # strip HTML tags and entities
            item['asseturl'] = 'https://www.glprop.com.cn/network-city-detail.html?city=' + item['assetaddress']
            yield item
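
With the items, pipeline, and spider in place, the crawl can be started with scrapy crawl puluosi, or from a small runner script. This is only a sketch; the news.spiders.puluosi module path is my assumption based on the project layout implied by the imports above:

# run_puluosi.py - minimal sketch of a programmatic runner
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from news.spiders.puluosi import PuluosiSpider  # module path assumed, adjust to your layout

process = CrawlerProcess(get_project_settings())  # picks up settings.py, including ITEM_PIPELINES
process.crawl(PuluosiSpider)
process.start()  # blocks until the crawl finishes; the pipeline writes the two xlsx files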

While I was at it, I also crawled the site's news pages.
