In this hands-on project we use a web crawler to scrape new-home listings from Lianjia (disclaimer: the content is for learning and exchange only, not for commercial use).
Environment
Win8, Python 3.7, PyCharm
Main text
1. Target website analysis
By analyzing the site, find the relevant URLs, determine the request method, and check whether any JS encryption is involved.
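As a quick sanity check (only a sketch, assuming the Beijing listing page bj.fang.lianjia.com/loupan/ that the spider visits later is reachable), you can confirm that the listing data comes back as plain HTML from an ordinary GET request rather than an encrypted JS interface:

import requests

# Minimal check: fetch the Beijing new-home list page with a browser-like User-Agent
resp = requests.get(
    "https://bj.fang.lianjia.com/loupan/",
    headers={"User-Agent": "Mozilla/5.0"},
)
print(resp.status_code)                      # expect 200 for a plain GET request
print("resblock-list-wrapper" in resp.text)  # True if the listing markup is in the raw HTML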
2. Create a new Scrapy project
1. In the cmd command line window, enter the following command to create the lianjia project
scrapy startproject lianjia
2. Still in cmd, change into the lianjia directory and create the Spider file
cd lianjia
scrapy genspider -t crawl xinfang lianjia.com
This generates a spider based on the CrawlSpider class, which is well suited to crawling pages in batches.
3. Create a new main.py file, which will be used to run the whole project (its content is shown in section 8)
The project skeleton is now in place, so let's start writing the code.
3. Define fields
Define the fields to be crawled in the items.py file
import scrapy
from scrapy.item import Item, Field

class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    city = Field()           # city name
    name = Field()           # property name
    type = Field()           # property type
    status = Field()         # sales status
    region = Field()         # district
    street = Field()         # street
    address = Field()        # detailed address
    area = Field()           # floor area
    average_price = Field()  # average price
    total_price = Field()    # total price
    tags = Field()           # tags
4. Crawler main program
Write the main crawler program in the xinfang.py file
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lianjia.items import LianjiaItem

class XinfangSpider(CrawlSpider):
    name = 'xinfang'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://bj.fang.lianjia.com/']
    # Crawling rules: LinkExtractor extracts links (allow is the permitted link pattern,
    # restrict_xpaths is where the link sits in the page structure), follow=True means
    # the extracted links are followed, and callback is the function called for each response
    rules = (
        Rule(LinkExtractor(allow=r'\.fang.*com/$',
                           restrict_xpaths='//div[@class="footer"]//div[@class="link-list"]/div[2]/dd'),
             follow=True),
        Rule(LinkExtractor(allow=r'.*loupan/$',
                           restrict_xpaths='//div[@class="xinfang-all"]/div/a'),
             callback='parse_item', follow=True)
    )

    def parse_item(self, response):
        '''Request every list page'''
        counts = response.xpath('//div[@class="page-box"]/@data-total-count').extract_first()
        pages = int(counts) // 10 + 2
        # The site shows at most 100 pages, so cap the page number
        if pages > 100:
            pages = 101
        for page in range(1, pages):
            url = response.url + "pg" + str(page)
            yield scrapy.Request(url, callback=self.parse_detail, dont_filter=False)

    def parse_detail(self, response):
        '''Parse the page content'''
        item = LianjiaItem()
        # City name, taken from the "found N properties" header of the result page
        item["city"] = response.xpath('//div[@class="resblock-have-find"]/span[3]/text()').extract_first()[1:]
        infos = response.xpath('//ul[@class="resblock-list-wrapper"]/li')
        for info in infos:
            item["name"] = info.xpath('div/div[1]/a/text()').extract_first()
            item["type"] = info.xpath('div/div[1]/span[1]/text()').extract_first()
            item["status"] = info.xpath('div/div[1]/span[2]/text()').extract_first()
            item["region"] = info.xpath('div/div[2]/span[1]/text()').extract_first()
            item["street"] = info.xpath('div/div[2]/span[2]/text()').extract_first()
            item["address"] = info.xpath('div/div[2]/a/text()').extract_first().replace(",", "")
            item["area"] = info.xpath('div/div[@class="resblock-area"]/span/text()').extract_first()
            item["average_price"] = "".join(info.xpath('div//div[@class="main-price"]//text()').extract()).replace(" ", "")
            item["total_price"] = info.xpath('div//div[@class="second"]/text()').extract_first()
            item["tags"] = ";".join(info.xpath('div//div[@class="resblock-tag"]//text()').extract()).replace(" ", "").replace("\n", "")
            yield item
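Before running the full crawl, the XPath expressions above can be spot-checked interactively with scrapy shell (the URL below is one of the list pages the spider will visit):

scrapy shell "https://bj.fang.lianjia.com/loupan/"
>>> response.xpath('//div[@class="page-box"]/@data-total-count').extract_first()
>>> response.xpath('//ul[@class="resblock-list-wrapper"]/li/div/div[1]/a/text()').extract()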
5. Save to MySQL database
Edit the following code in the pipelines.py file
import pymysql

class LianjiaPipeline(object):
    def __init__(self):
        # Create the database connection
        self.db = pymysql.connect(
            host="localhost",
            user="root",
            password="1234",
            port=3306,
            db="lianjia",
            charset="utf8"
        )
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Store the item in the database
        sql = ("INSERT INTO xinfang(city, name, type, status, region, street, address, "
               "area, average_price, total_price, tags) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        data = (item["city"], item["name"], item["type"], item["status"], item["region"],
                item["street"], item["address"], item["area"], item["average_price"],
                item["total_price"], item["tags"])
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            self.db.rollback()
        finally:
            return item
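The pipeline above assumes that the lianjia database and the xinfang table already exist. A minimal sketch for creating them with pymysql is shown below; the column names match the INSERT statement, but the column types are my own assumptions and can be adjusted as needed:

import pymysql

# Assumption: same connection parameters as in the pipeline above
db = pymysql.connect(host="localhost", user="root", password="1234",
                     port=3306, charset="utf8")
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS lianjia DEFAULT CHARACTER SET utf8")
cursor.execute("USE lianjia")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS xinfang (
        id INT AUTO_INCREMENT PRIMARY KEY,
        city VARCHAR(50),
        name VARCHAR(100),
        type VARCHAR(50),
        status VARCHAR(50),
        region VARCHAR(50),
        street VARCHAR(100),
        address VARCHAR(255),
        area VARCHAR(100),
        average_price VARCHAR(100),
        total_price VARCHAR(100),
        tags VARCHAR(255)
    ) DEFAULT CHARSET=utf8
""")
db.close()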
6. Anti-crawling measures
Since we are crawling in batches, we need to take some measures against the site's anti-crawling defenses. Here I use a free IP proxy. Edit the following code in middlewares.py:
from scrapy import signals
import logging
import requests

class ProxyMiddleware(object):
    def __init__(self, proxy):
        self.logger = logging.getLogger(__name__)
        self.proxy = proxy

    @classmethod
    def from_crawler(cls, crawler):
        '''Read the proxy API address from the settings'''
        settings = crawler.settings
        return cls(
            proxy=settings.get('RANDOM_PROXY')
        )

    def get_random_proxy(self):
        '''Fetch a random proxy from the API'''
        try:
            response = requests.get(self.proxy)
            if response.status_code == 200:
                proxy = response.text
                return proxy
        except Exception:
            return False

    def process_request(self, request, spider):
        '''Send the request through the randomly fetched proxy'''
        proxy = self.get_random_proxy()
        if proxy:
            url = 'http://' + str(proxy)
            self.logger.debug('Using proxy: ' + proxy)
            request.meta['proxy'] = url
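This middleware assumes a proxy-pool service is already running locally and that its API (configured as RANDOM_PROXY in settings.py in the next step) returns a bare ip:port string. A quick way to verify that assumption before starting the crawl:

import requests

# Assumption: the proxy-pool API from settings.py (RANDOM_PROXY) is running locally
resp = requests.get("http://localhost:6686/random")
print(resp.status_code, resp.text)   # expect something like: 200 123.45.67.89:8888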
7. Configure the settings file
import random

RANDOM_PROXY = "http://localhost:6686/random"

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'

ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = random.random() * 2
COOKIES_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}

DOWNLOADER_MIDDLEWARES = {
    'lianjia.middlewares.ProxyMiddleware': 543,
}

ITEM_PIPELINES = {
    'lianjia.pipelines.LianjiaPipeline': 300,
}
8. Run the project
Put the following code in main.py and run it
from scrapy import cmdline

cmdline.execute('scrapy crawl xinfang'.split())
The project will start running, and in the end more than 14,000 records are crawled.
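As a final sanity check, the number of crawled records can be counted directly in MySQL; a sketch assuming the same connection parameters as in the pipeline:

import pymysql

db = pymysql.connect(host="localhost", user="root", password="1234",
                     port=3306, db="lianjia", charset="utf8")
cursor = db.cursor()
cursor.execute("SELECT COUNT(*) FROM xinfang")
print(cursor.fetchone()[0])   # roughly 14000+ rows after a full crawl
db.close()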