Scrapy: crawling data with PhantomJS
Environment: Python 2.7 + Scrapy + Selenium + PhantomJS
Content: test summary + PhantomJS
Crawl target: pages that use JS to load more content
Principle: enable the downloader middleware in the settings file and modify its process_request function (add the PhantomJS operations there)
Step 1:
settings.py
DOWNLOADER_MIDDLEWARES = {
    'dbdm.middlewares.DbdmSpiderMiddleware': 543,
}
The middleware path will differ with the project name; that has no effect on the approach.
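For orientation, the process_request snippets in Step 2 omit the surrounding class ("Some code is omitted"). A minimal skeleton of the middleware class registered above, assuming the project is named dbdm, could look like this:

# middlewares.py -- minimal skeleton; the class name must match the
# entry registered in DOWNLOADER_MIDDLEWARES above
from selenium import webdriver
from scrapy.http import HtmlResponse

class DbdmSpiderMiddleware(object):
    @classmethod
    def process_request(cls, request, spider):
        # PhantomJS handling goes here (see Step 2);
        # returning None lets Scrapy download the request normally
        return None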
Step 2:
----------Enable PhantomJS by default
middlewares.py
Selenium needs to be imported:
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

# Some code is omitted
@classmethod
def process_request(cls, request, spider):
    # if 'PhantomJS' in request.meta:
    driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
    driver.get(request.url)
    if request.url == 'https://movie.douban.com/tag':
        # click the tag to trigger the JS-rendered list
        driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/div[1]/ul[1]/li[5]/span').click()
        time.sleep(5)
        if driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a'):
            click_more(driver)
    content = driver.page_source.encode('utf-8')
    # print content
    # file = open(path.join(d, '1.txt'), 'w')
    # file.write(content)
    # file.close()
    driver.quit()
    # hand the rendered page back to Scrapy instead of downloading it again
    return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

def click_more(driver, i=1):
    # keep clicking the "load more" link until it disappears
    driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a').click()
    print str(i) + ' click'
    time.sleep(5)
    i = i + 1
    try:
        more_btn = driver.find_element_by_xpath('//*[@id="app"]/div/div[1]/a')
        if more_btn:
            click_more(driver, i)
    except:
        print 'click Over!!'
The above is only test code; adapt it to your own project. In this version PhantomJS is opened for every URL by default; it can also be made conditional, as shown next.
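One practical issue with the test code above is that it launches a new PhantomJS process for every request. A possible refinement, shown here only as a sketch (it is not part of the original code, and it assumes the same executable path), is to create the driver once and quit it when the spider closes:

# middlewares.py -- sketch: reuse a single PhantomJS instance
from selenium import webdriver
from scrapy import signals
from scrapy.http import HtmlResponse

class PhantomJSMiddleware(object):
    def __init__(self):
        # one browser process for the whole crawl
        self.driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # quit the browser when the spider closes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def process_request(self, request, spider):
        self.driver.get(request.url)
        content = self.driver.page_source.encode('utf-8')
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)

    def spider_closed(self, spider):
        self.driver.quit()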
-----------Enable PhantomJS only when needed
Decide based on the value of a meta key.
Selenium needs to be imported:
from selenium import webdriver
from scrapy.http import HtmlResponse

# Some code is omitted
@classmethod
def process_request(cls, request, spider):
    if 'PhantomJS' in request.meta:
        driver = webdriver.PhantomJS('E:\\p_python\\Scripts\\phantomjs\\bin\\phantomjs.exe')
        driver.get(request.url)
        content = driver.page_source.encode('utf-8')
        driver.quit()
        return HtmlResponse(request.url, encoding='utf-8', body=content, request=request)
    # otherwise return None so Scrapy downloads the request as usual
The meta key is set in the spider file:
import scrapy
from scrapy import Request
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from phantomjs_test.items import PhantomscrapyItem

class PhantomjsTestSpider(CrawlSpider):
    name = 'phantomjs_test'
    allowed_domains = ['book.com']
    start_urls = ['http://book.com/']
    # all_urls = []  deduplication does not seem necessary
    rules = (
        ### Get all paging list pages
        Rule(LinkExtractor(allow=r'/story/p/[2-9]*'), callback='parse', follow=True),
        ### Get all the detail pages inside them
        # Rule(LinkExtractor(allow=r'/detail/p/[2-9]*'), callback='parse_item', follow=True),
    )

    ### Get all the article URLs from the paging page
    def parse(self, response):
        url_list = response.xpath('//a/@href').extract()
        for url in url_list:
            request = Request(url=url, callback=self.parse_item, dont_filter=True)
            # flag the request so the middleware renders it with PhantomJS
            request.meta['PhantomJS'] = True
            yield request

    def parse_item(self, response):
        item = PhantomscrapyItem()
        # i['domain_id'] = response.xpath('//input[@id="sid"]/@value').extract()
        # i['name'] = response.xpath('//div[@id="name"]').extract()
        # i['description'] = response.xpath('//div[@id="description"]').extract()
        item['bookName'] = response.xpath()  # fill in the XPath for the book name here
        items = []
        items.append(item)
        return items
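The spider imports PhantomscrapyItem from items.py, which the original post does not show. A minimal definition covering the only field used above (bookName) might look like this:

# items.py -- minimal item definition matching the field used in parse_item
import scrapy

class PhantomscrapyItem(scrapy.Item):
    bookName = scrapy.Field()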
The above shows the difference between enabling PhantomJS by default and enabling it only when a condition is met. You can choose per page; the code can still be polished to be more user-friendly.