Recently I needed to crawl data from the official website of a logistics company. At first glance the site looked like plain static pages, nothing like news portals or e-commerce sites with their complex page structures, strict anti-crawling measures, and heavy use of AJAX, so I was quietly relieved. But when I dug a little deeper, I found the pages are not ordinary static pages after all.
For example, on the page below I want to get the distribution of logistics parks across major cities in China, together with the details of each park.
The page embeds a map, and you have to click each city on the map before its logistics information is displayed.
https://www.glprop.com.cn/our...
At first I suspected the data was loaded via an AJAX request, but I couldn't find one in Chrome's developer tools. Then I checked the page source and found all of the city information inside a single script tag.
As shown in the picture:
The information for each park is likewise stored in a parks = {...} variable.
Everything is right there: grab the page source, match it with regular expressions, and we're done.
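Before wiring this into Scrapy, the idea can be sketched with requests plus re and json. This is only a rough sketch under the assumption that the page really embeds var cities = [...] and var parks = {...} (the same patterns the spider below relies on) and that it can be fetched without extra headers; the variables may actually live on the per-city detail page rather than the overview page.

import re
import json
import requests

# Rough sketch (assumptions: the page embeds "var cities = [...]" and
# "var parks = {...}", and no special headers are needed to fetch it).
url = 'https://www.glprop.com.cn/our-network/network-detail.html'
html = requests.get(url, timeout=10).text

cities_match = re.search(r'var cities =(.*);', html)
parks_match = re.search(r'var parks =(.*);', html)
if cities_match and parks_match:
    cities = json.loads(cities_match.group(1))          # list of {"id": ..., "name": ...}
    parks = json.loads(parks_match.group(1))            # nested dict of park details
    city_names = {c['id']: c['name'] for c in cities}   # map city id -> city name
    print(len(city_names), 'cities found')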
item:
import scrapy

# GLP (Puluosi) item definitions
class PuluosiNewsItem(scrapy.Item):
    newstitle = scrapy.Field()
    newtiems = scrapy.Field()
    newslink = scrapy.Field()

class PuluosiItem(scrapy.Item):
    assetstitle = scrapy.Field()
    assetaddress = scrapy.Field()
    assetgaikuang = scrapy.Field()
    assetpeople = scrapy.Field()
    asseturl = scrapy.Field()
pipelines:
from openpyxl import Workbook
from news.items import PuluosiNewsItem, PuluosiItem

class PuluosiNewsPipeline(object):
    def __init__(self):
        self.wb = Workbook()
        self.ws = self.wb.active
        # Set the header row of each workbook
        self.ws.append(['GLP news title', 'Release time', 'News URL'])
        self.wb2 = Workbook()
        self.ws2 = self.wb2.active
        self.ws2.append(['Asset title', 'Asset address', 'Asset profile', 'Other information', 'URL'])

    def process_item(self, item, spider):
        if isinstance(item, PuluosiNewsItem):
            line = [item['newstitle'], item['newtiems'], item['newslink']]  # Collect the fields of this item
            self.ws.append(line)
            self.wb.save('PuluosiNews.xlsx')  # Save the xlsx file
        elif isinstance(item, PuluosiItem):
            line = [item['assetstitle'], item['assetaddress'], item['assetgaikuang'], item['assetpeople'], item['asseturl']]
            self.ws2.append(line)
            self.wb2.save('PuluosiAsset.xlsx')  # Save the xlsx file
        return item
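For Scrapy to actually call this pipeline, it also has to be enabled in settings.py. A minimal sketch, assuming the project package is named news as the imports above suggest:

# settings.py (sketch): register the Excel pipeline; 300 is an ordinary priority value
ITEM_PIPELINES = {
    'news.pipelines.PuluosiNewsPipeline': 300,
}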
spider:
# -*- coding: utf-8 -*-
import scrapy, re, json
from news.items import PuluosiNewsItem, PuluosiItem
from scrapy.linkextractors import LinkExtractor

class PuluosiSpider(scrapy.Spider):
    name = 'puluosi'
    allowed_domains = ['glprop.com.cn']
    # start_urls = ['https://www.glprop.com.cn/press-releases.html']

    def start_requests(self):
        yield scrapy.Request('https://www.glprop.com.cn/press-releases.html', self.parse1)
        yield scrapy.Request('https://www.glprop.com.cn/in-the-news.html', self.parse2)
        yield scrapy.Request('https://www.glprop.com.cn/proposed-privatization.html', self.parse3)
        yield scrapy.Request('https://www.glprop.com.cn/our-network/network-detail.html', self.parse4)

    def parse1(self, response):
        print('The spider currently running is: puluosi')
        item = PuluosiNewsItem()
        web = response.xpath('//tbody/tr')
        web.pop(0)  # Drop the header row
        for node in web:
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL from the relative href found in the page
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            # print(item['newslink'])
            yield item
        # Use try to check whether this year's news list has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(), "next page")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse1)
        except Exception as e:
            print("The current page has no next page")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse1)

    def parse2(self, response):
        item = PuluosiNewsItem()
        web = response.xpath('//tbody/tr')
        web.pop(0)  # Drop the header row
        for node in web:
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL from the relative href found in the page
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item
        # Use try to check whether this year's news list has a next page
        try:
            next_url_tmp = response.xpath('//div[@class="page"]/a[contains(text(), "next page")]/@href').extract()[0]
            if next_url_tmp:
                next_url = "https://www.glprop.com.cn" + next_url_tmp
                yield scrapy.Request(next_url, callback=self.parse2)
        except Exception as e:
            print("The current page has no next page")
        href = response.xpath('//ul[@class="timeList"]/li/a/@href')
        for nexturl in href:
            url1 = nexturl.extract()
            if url1:
                url = "https://www.glprop.com.cn" + url1
                yield scrapy.Request(url, callback=self.parse2)

    def parse3(self, response):
        item = PuluosiNewsItem()
        web = response.xpath('//tbody/tr')
        web.pop()
        for node in web:
            item['newstitle'] = node.xpath('.//a/text()').extract()[0].strip()
            print(item['newstitle'])
            item['newtiems'] = node.xpath('.//td/text()').extract()[0].strip()
            print(item['newtiems'])
            # urljoin builds an absolute URL from the relative href found in the page
            item['newslink'] = response.urljoin(node.xpath('.//a/@href').extract()[0])
            print(item['newslink'])
            yield item

    def parse4(self, response):
        # Extract the link of every city from the map pop-up
        link = LinkExtractor(restrict_xpaths='//div[@class="net_pop1"]//div[@class="city"]')
        links = link.extract_links(response)
        for i in links:
            detailurl = i.url
            yield scrapy.Request(url=detailurl, callback=self.parse5)

    def parse5(self, response):
        item = PuluosiItem()
        citycode = re.findall('var cities =(.*);', response.text)
        citycodejson = json.loads("".join(citycode))
        # Put the id and name of each city into a dictionary
        dictcity = {}
        for i in citycodejson:
            citycodename = i['name']
            citycodenm = i['id']
            dictcity[citycodenm] = citycodename
        detail = re.findall('var parks =(.*);', response.text)
        jsonBody = json.loads("".join(detail))
        park_list = []
        for key1 in jsonBody:
            for key2 in jsonBody[key1]:
                park_list.append(jsonBody[key1][key2])
        for node in park_list:
            assetaddress = node['city_id']
            item['assetaddress'] = dictcity[assetaddress]
            # print(item['assetaddress'])
            item['assetstitle'] = node['name']
            # print(item['assetstitle'])
            item['assetgaikuang'] = node['detail_single'].strip().replace(' ', '').replace(' ', '')
            # print(item['assetgaikuang'])
            assetpeople = node['description']
            item['assetpeople'] = re.sub(r'<.*?>', '', assetpeople.strip()).replace(' ', '')
            item['asseturl'] = 'https://www.glprop.com.cn/network-city-detail.html?city=' + item['assetaddress']
            # print(item['assetpeople'])
            yield item
While I was at it, I also crawled the news pages of the site.
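With the items, pipeline, and spider in place, the crawl can be started with scrapy crawl puluosi from the project directory, or, as a small convenience, from a script. A minimal sketch, assuming it is run from the directory that contains scrapy.cfg:

# run.py (sketch): start the puluosi spider from a script instead of the shell
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'puluosi'])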