Scrapy crawls data using PhantomJS

Environment: Python 2.7 + Scrapy + Selenium + PhantomJS

Content: test notes on using PhantomJS with Scrapy

Crawling target: pages that use JS to load more content

Principle: enable the downloader middleware in the configuration file and modify its process_request function (add the PhantomJS logic there)

Step 1:

settings.py

DOWNLOADER_MIDDLEWARES = {
    'dbdm.middlewares.DbdmSpiderMiddleware': 543,
}

The middleware path differs with the project name; adjust it to match your own project.
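
For orientation, process_request is a hook on the downloader middleware class registered above. A minimal skeleton of that class (a sketch only; the actual body is filled in under Step 2) might look like this:

# middlewares.py -- sketch of the middleware class named in settings.py above
class DbdmSpiderMiddleware(object):

    def process_request(self, request, spider):
        # Returning None lets Scrapy download the request normally;
        # returning an HtmlResponse (as in Step 2) short-circuits the download.
        return None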

Step 2:

----------Enable PhantomJS by default

middlewares.py

Selenium needs to be imported:

import time

from selenium import webdriver
from scrapy.http import HtmlResponse
#Some code is omitted

@classmethod
def process_request(cls, request, spider):
    #if 'PhantomJS' in request.meta:
    driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
    driver.get(request.url)
    if request.url == 'https://movie.douban.com/tag':
        # Click the category tab, then wait for the JS-rendered content
        driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/div[1]/ul[1]/li[5]/span').click()
        time.sleep(5)
        if driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a'):
            click_more(driver)
    content = driver.page_source.encode('utf-8')
    #print content
    #file = open(path.join(d, '1.txt'), 'w')
    #file.write(content)
    #file.close()
    driver.quit()
    # Hand the rendered page back to Scrapy instead of downloading it again
    return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

def click_more(driver, i=1):
    # Click the "load more" link, wait for the new content to render,
    # then recurse until the link can no longer be found.
    driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a').click()
    print str(i) + '  click'
    time.sleep(5)
    i = i + 1
    try:
        more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
        if more_btn:
            click_more(driver, i)
    except:
        print 'click Over!!'
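
With many "load more" clicks, the recursion above could in principle hit Python's recursion limit; below is a sketch of an iterative variant under the same XPath assumptions (click_more_loop and the delay parameter are names introduced here, not from the original code):

from selenium.common.exceptions import NoSuchElementException

def click_more_loop(driver, delay=5):
    # Iterative version of click_more: keep clicking the "load more" link
    # until Selenium can no longer find it on the page.
    clicks = 0
    while True:
        try:
            more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
        except NoSuchElementException:
            print 'click Over!!'
            break
        more_btn.click()
        clicks += 1
        print str(clicks) + '  click'
        time.sleep(delay)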

 

The above is just test code; adapt it to your own project. As written, PhantomJS is opened by default for every requested URL, but it can also be made conditional, as shown next.

-----------Enable PhantomJS only when it is needed

Check the value of a key in request.meta


Selenium needs to be imported:

from selenium import webdriver
from scrapy.http import HtmlResponse
#Some code is omitted

@classmethod
def process_request(cls, request, spider):
    # Only render with PhantomJS when the spider asks for it via request.meta
    if 'PhantomJS' in request.meta:
        driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
    # Requests without the key fall through to Scrapy's normal downloader

The PhantomJS meta key is set in the spider file:

import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from phantomjs_test.items import PhantomscrapyItem

class PhantomjsTestSpider(CrawlSpider):
    name = 'phantomjs_test'
    allowed_domains = ['book.com']
    start_urls = ['http://book.com/']
    #all_urls = []   # deduplication does not seem necessary here
    rules = (
        ###Get all page lists
        Rule(LinkExtractor(allow=r'/story/p/[2-9]*'), callback='parse_page', follow=True),
        ###Get all the details in it
        #Rule(LinkExtractor(allow=r'/detail/p/[2-9]*'), callback = 'parse_item',follow=True),
    )

    ###Get all the article URLs from the paging page
    def parse_page(self, response):
        url_list = response.xpath('//a/@href').extract()
        for url in url_list:
            request = Request(url=url, callback=self.parse_item, dont_filter=True)
            request.meta['PhantomJS'] = True
            yield request

    def parse_item(self, response):
        item = PhantomscrapyItem()
        #i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        #i['name'] = response.xpath('//div[@id="name"]').extract()
        #i['description'] = response.xpath('//div[@id="description"]').extract()
        item['bookName'] = response.xpath()  # XPath selector left blank in the original; fill in your own
        items = []
        items.append(item)
        return items
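
The PhantomscrapyItem imported above is not shown in the post; a minimal items.py sketch (only the bookName field is actually used by the spider, any other fields would be assumptions) could be:

# items.py -- minimal sketch; bookName is the only field the spider references
import scrapy

class PhantomscrapyItem(scrapy.Item):
    bookName = scrapy.Field()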

The above shows the difference between enabling PhantomJS by default and enabling it only when a condition is met. It can be configured per page type, and the code is still test-grade; it could be polished further.
