From f1078e4837b9f6df4e727af35f63ed3d69b40a64 Mon Sep 17 00:00:00 2001 From: RenMeng <853750873@qq.com> Date: Fri, 25 Oct 2019 17:41:29 +0800 Subject: [PATCH 1/2] scrapy trulia for-sale houses --- README.md | 35 +--- trulia_scraper/items.py | 173 ++++++++++------ trulia_scraper/middlewares.py | 27 +++ trulia_scraper/parsing.py | 15 +- trulia_scraper/pipelines.py | 5 +- trulia_scraper/settings.py | 8 + trulia_scraper/spiders/trulia.py | 273 +++++++++++++++++++++----- trulia_scraper/spiders/trulia_sold.py | 98 ++++----- utils.py | 18 ++ 9 files changed, 460 insertions(+), 192 deletions(-) create mode 100644 utils.py diff --git a/README.md b/README.md index 8783c57..5ef7732 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,8 @@ -# trulia-scraper -Scraper for real estate listings on [Trulia.com](https://www.trulia.com/) implemented in Python with Scrapy. +# Update trulia-scraper +Details please refer to [khpeek](https://github.com/khpeek/trulia-scraper) ## Basic usage -To crawl the scraper, you need to install [Python 3](https://www.python.org/download/releases/3.0/), as well as the [Scrapy](https://pypi.python.org/pypi/Scrapy) framework and the [Pyparsing](https://pypi.python.org/pypi/pyparsing/2.2.0) module. The scraper features two spiders: - 1. `trulia`, which scrapes all real estate listings which are _for sale_ in a given state and city starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/); -2. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). - -To crawl the `trulia_sold` spider for the state of `CA` and city of `San_Francisco` (the default locale), simply run the command - -``` -scrapy crawl trulia_sold -``` -from the project directory. To scrape listings for another city, specify the `city` and `state` arguments using the `-a` flag. For example, - -``` -scrapy crawl trulia_sold -a state=NY -a city=New_York -``` -will scrape all listings reachable from [https://www.trulia.com/sold/New_York,NY/](https://www.trulia.com/sold/New_York,NY/). - -By default, the scraped data will be stored (using Scrapy's [feed export](https://doc.scrapy.org/en/latest/topics/feed-exports.html)) in the `data` directory as a [JSON lines](http://jsonlines.org/) (`.jl`) file following the naming convention - -``` -data_{sold|for_sale}_{state}_{city}_{time}.jl -``` - -where `{sold|for_sale}` is `sold` or `for_sale` for the `trulia` and `trulia_sold` spiders, respectively, `{state}` and `{city}` are the specified state and city (e.g. `CA` and `San_Francisco`, respectively), and `{time}` represents the current UTC time. - -If you prefer a different output file name and format, you can specify this from the command line using Scrapy's `-o` option. For example, -``` -scrapy crawl trulia_sold -a state=WA -city=Seattle -o data_Seattle.csv -``` -will output the data in CSV format as `data_Seattle.csv`. (Scrapy automatically picks up the file format from the specified file extension). \ No newline at end of file +## To do +1. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). 
\ No newline at end of file diff --git a/trulia_scraper/items.py b/trulia_scraper/items.py index 6b7ac3d..8470809 100644 --- a/trulia_scraper/items.py +++ b/trulia_scraper/items.py @@ -1,70 +1,129 @@ # -*- coding: utf-8 -*- -from scrapy.loader import ItemLoader -from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Compose +from scrapy.loader.processors import TakeFirst, Identity, Compose, Join import scrapy -from trulia_scraper.parsing import remove_empty, get_number_from_string +from trulia_scraper.parsing import * +class overview_item(scrapy.Item): + url = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + address = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + city_state = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price = scrapy.Field( + output_processor=Compose(TakeFirst(),get_number_from_string) + ) # for items on sale only + area = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + bedrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) + bathrooms = scrapy.Field( + output_processor= Compose(TakeFirst(), float) + ) + year_built = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) + lot_size = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + lot_size_units = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price_per_square_foot = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + days_on_Trulia = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) -class TruliaItem(scrapy.Item): - url = scrapy.Field() - address = scrapy.Field() - city_state = scrapy.Field() - price = scrapy.Field() # for items on sale only - neighborhood = scrapy.Field() - overview = scrapy.Field() - description = scrapy.Field() +class basic_info_item(scrapy.Item): + url = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + address = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + city_state = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) # for items on sale only + area = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + bedrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) + bathrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) - # Columns from the 'price events' table are stored in separate lists - prices = scrapy.Field() - dates = scrapy.Field() - events = scrapy.Field() - # Property tax information is on 'sold' pages only - property_tax_assessment_year = scrapy.Field() - property_tax = scrapy.Field() - property_tax_assessment_land = scrapy.Field() - property_tax_assessment_improvements = scrapy.Field() - property_tax_assessment_total = scrapy.Field() - property_tax_market_value = scrapy.Field() +class price_item(scrapy.Item): + prices = scrapy.Field( + output_processor= Identity() + ) + dates = scrapy.Field( + output_processor= Compose(remove_empty) + ) + events = scrapy.Field( + output_processor= Compose(remove_empty) + ) - # The 'Features' sections is on 'for sale' pages only - listing_information = scrapy.Field() - listing_information_date_updated = scrapy.Field() - public_records = scrapy.Field() - public_records_date_updated = scrapy.Field() +class taxes_item(scrapy.Item): + property_tax_assessment_year = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) + property_tax = scrapy.Field( + 
output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_land = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_improvements = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_total = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + +class price_trends_item(scrapy.Item): + item1 = scrapy.Field( + output_processor=Compose(Join()) + ) + item2 = scrapy.Field( + output_processor=Compose(Join()) + ) + item3 = scrapy.Field( + output_processor=Compose(Join()) + ) - # Items generated from further parsing of 'raw' scraped data - area = scrapy.Field() - lot_size = scrapy.Field() - lot_size_units = scrapy.Field() - price_per_square_foot = scrapy.Field() # For properties on sale only - bedrooms = scrapy.Field() - bathrooms = scrapy.Field() - year_built = scrapy.Field() - days_on_Trulia = scrapy.Field() - views = scrapy.Field() - price_history = scrapy.Field() +class TruliaItem(scrapy.Item): + overview = scrapy.Field() + local_information = scrapy.Field() #todo + + description = scrapy.Field() + community_description = scrapy.Field() + home_detail = scrapy.Field() + office_hours = scrapy.Field() + open_house = scrapy.Field() + + price_history = scrapy.Field() + similar_homes = scrapy.Field() #todo + new_listing = scrapy.Field() #todo -class TruliaItemLoader(ItemLoader): - default_input_processor = MapCompose(str.strip) - default_output_processor = TakeFirst() + property_taxes = scrapy.Field() + price_trends = scrapy.Field() + comparable_sales = scrapy.Field() - price_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) - overview_out = Identity() - description_out = Compose(remove_empty) - prices_out = Identity() - dates_out = Compose(remove_empty) - events_out = Compose(remove_empty) + local_commons = scrapy.Field() #todo + new_homes = scrapy.Field() - listing_information_out = Identity() - public_records_out = Identity() - area_out = Compose(TakeFirst(), get_number_from_string) - lot_size_out = Compose(TakeFirst(), get_number_from_string) - price_per_square_foot_out = Compose(TakeFirst(), get_number_from_string) - bedrooms_out = Compose(TakeFirst(), int) - bathrooms_out = Compose(TakeFirst(), int) - year_built_out = Compose(TakeFirst(), int) - days_on_Trulia_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) - views_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) diff --git a/trulia_scraper/middlewares.py b/trulia_scraper/middlewares.py index ee550c4..72f9211 100644 --- a/trulia_scraper/middlewares.py +++ b/trulia_scraper/middlewares.py @@ -54,3 +54,30 @@ def process_start_requests(self, start_requests, spider): def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) + + +# class CustomProxyMiddleware(object): +# def process_request(self, request, spider): +# request.meta['proxy'] = "http://127.0.0.1:8118" + +# http agents +http_agents = [ + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", + "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + "Mozilla/5.0 (Windows; U; 
Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", + "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", + "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0", +] + +def get_agent(): + import random + return random.choice(http_agents) + + +class UserAgentMiddleware(object): + def process_request(self, request, spider): + request.headers['User-Agent'] = get_agent() diff --git a/trulia_scraper/parsing.py b/trulia_scraper/parsing.py index 1ec9e2a..64f0964 100644 --- a/trulia_scraper/parsing.py +++ b/trulia_scraper/parsing.py @@ -1,7 +1,20 @@ +import re +KEYS = set(['schools', 'crime', 'commute', 'shop & eat']) def remove_empty(l): '''Remove items which evaluate to False (such as empty strings) from the input list.''' return [x for x in l if x] +def remove_key_words(l): + return [x for x in l if x.lower() not in KEYS] + def get_number_from_string(string, number_type=float): '''Remove commas from the input string and parse as a number''' - return number_type(string.replace(',', '')) \ No newline at end of file + string = string.replace('$', '') + return number_type(string.replace(',', '')) + +def match_quote(l): + result = re.findall('.*\"(.*)\".*', l) + if len(result) > 0: + return result[0] + else: + return '' \ No newline at end of file diff --git a/trulia_scraper/pipelines.py b/trulia_scraper/pipelines.py index 4714fd0..e316d6e 100644 --- a/trulia_scraper/pipelines.py +++ b/trulia_scraper/pipelines.py @@ -4,8 +4,9 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html - +from scrapy import signals +from scrapy.exporters import CsvItemExporter class TruliaScraperPipeline(object): def process_item(self, item, spider): - return item + return item \ No newline at end of file diff --git a/trulia_scraper/settings.py b/trulia_scraper/settings.py index 0f337dd..1aaa840 100644 --- a/trulia_scraper/settings.py +++ b/trulia_scraper/settings.py @@ -17,6 +17,8 @@ # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'trulia_scraper (+http://www.yourdomain.com)' +# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' + # Obey robots.txt rules ROBOTSTXT_OBEY = True @@ -88,3 +90,9 @@ #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +DOWNLOADER_MIDDLEWARES = { + # 'trulia_scraper.middlewares.CustomProxyMiddleware': 10, + # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 20, + 'trulia_scraper.middlewares.UserAgentMiddleware': 543, +} \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia.py b/trulia_scraper/spiders/trulia.py index 741152f..974f5a6 100644 --- a/trulia_scraper/spiders/trulia.py +++ b/trulia_scraper/spiders/trulia.py @@ -1,28 +1,43 @@ # -*- coding: utf-8 -*- import os +import re import scrapy import math import datetime from scrapy.linkextractors import LinkExtractor -from trulia_scraper.items import TruliaItem, TruliaItemLoader +from trulia_scraper.items import * from 
trulia_scraper.parsing import get_number_from_string from scrapy.utils.conf import closest_scrapy_cfg +from scrapy.loader import ItemLoader +from utils import get_rel_url +from itertools import zip_longest + class TruliaSpider(scrapy.Spider): + cols = ['overview', 'local_information', 'description', 'home_detail', + 'community_description', 'office_hours', 'open_house', + 'price_history', 'price_trends', 'property_taxes', + 'new_homes', 'similar_homes', 'new_listing', 'comparable_sales', + 'local_commons'] + name = 'trulia' allowed_domains = ['trulia.com'] custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines'} + 'FEED_FORMAT': 'jsonlines', + 'FEED_EXPORT_FIELDS': cols} - def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): + def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): super().__init__(*args, **kwargs) self.state = state self.city = city + # self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] - self.le = LinkExtractor(allow=r'^https://www.trulia.com/property') + # self.le = LinkExtractor(allow=[r'^https://www.trulia.com/p/ca', r'^https://www.trulia.com/property', r'^https://www.trulia.com/builder-community']) + self.link_path = '//div[@data-testid="search-result-list-container"]//a/@href' def parse(self, response): + print(response.url, response.status) N = self.get_number_of_pages_to_scrape(response) self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N)) for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: @@ -30,64 +45,218 @@ def parse(self, response): @staticmethod def get_number_of_pages_to_scrape(response): - pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]') - number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', '')) + pagination = response.xpath( + '//div[@data-testid="pagination-caption"]/text()').extract() + if len(pagination) > 0: + pagination = pagination[0] + pattern = '^1-30 of ([\d,]+) Results$' + number_of_results = int(re.findall(pattern, pagination)[0].replace(',', '')) + print(number_of_results) return math.ceil(number_of_results/30) def parse_index_page(self, response): - for link in self.le.extract_links(response): - yield scrapy.Request(url=link.url, callback=self.parse_property_page) + # print(len(self.le.extract_links(response))) + # for link in self.le.extract_links(response): + # yield scrapy.Request(url=link.url, callback=self.parse_property_page) + for url in response.xpath(self.link_path).extract(): + yield scrapy.Request(url=get_rel_url(response.url, url), callback=self.parse_property_page) def parse_property_page(self, response): - l = TruliaItemLoader(item=TruliaItem(), response=response) - self.load_common_fields(item_loader=l, response=response) + # overview + il = ItemLoader(item=overview_item(), response=response) + il.add_value('url', response.url) + overview_node = il.nested_xpath('//div[@data-testid="home-details-summary-container"]') + overview_node.add_xpath('address', './/span[@data-testid="home-details-summary-headline"]/text()') + overview_node.add_xpath('city_state', + './/span[@data-testid="home-details-summary-city-state"]/text()') + 
overview_node.add_xpath('price', + './/*[@data-testid="on-market-price-details"]//text()', + re=r'\$([\d,]+)') + overview_node.add_xpath('area', xpath='.//li//text()', re=r'^([\d,]+)\s?sqft$') + overview_node.add_xpath('bedrooms', xpath='.//li//text()', re=r'(\d+\.?\d?) (?:Beds|Bed|beds|bed)$') + overview_node.add_xpath('bathrooms', xpath='.//li//text()', re=r'(\d+\.?\d?) (?:Baths|Bath|baths|bath)$') - listing_information = l.nested_xpath('//span[text() = "LISTING INFORMATION"]') - listing_information.add_xpath('listing_information', './parent::div/following-sibling::ul[1]/li/text()') - listing_information.add_xpath('listing_information_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)') + details = il.nested_xpath('//div[@data-testid="features-container"]') + details.add_xpath('year_built', xpath='.//li//text()', re='Built in (\d+)') + details.add_xpath('lot_size', xpath='.//li//text()', re=r'Lot Size: ([\d,.]+) (?:acres|sqft)$') + details.add_xpath('lot_size_units', xpath='.//li//text()', re=r'Lot Size: [\d,.]+ (acres|sqft)$') + details.add_xpath('price_per_square_foot', xpath='.//li//text()', re=r'\$([\d,.]+)/sqft$') + details.add_xpath('days_on_Trulia', xpath='.//li//text()', re=r'([\d,]+)\+? Days on Trulia$') + overview_dict = il.load_item() - public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]') - public_records.add_xpath('public_records', './parent::div/following-sibling::ul[1]/li/text()') - public_records.add_xpath('public_records_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)') + # local info + local_info_list = response.xpath( + '(//*[div="Local Information"]/parent::div)[2]/following-sibling::div/div/div//text()').extract() + # for i in range(len(local_info_list) - 1, -1, -1): + # if "Map View" in local_info_list[i] or "Street View" in local_info_list[i]: + # local_info_list.remove(local_info_list[i]) + local_dict_values = '\n'.join(local_info_list) - item = l.load_item() - self.post_process(item=item) - return item + # price_history + il = ItemLoader(item=price_item(), response=response) + table_xpath = '//div[contains(text(), "Price History for")]/../../following-sibling::table' + il.add_xpath('dates', table_xpath + '//tr[1]/td[1]//text()') + il.add_xpath('prices', table_xpath + '//tr[1]/td[2]//text()') + il.add_xpath('events', table_xpath + '//tr[1]/td[3]//text()') + price_dict = il.load_item() - @staticmethod - def load_common_fields(item_loader, response): - '''Load field values which are common to "on sale" and "recently sold" properties.''' - item_loader.add_value('url', response.url) - item_loader.add_xpath('address', '//*[@data-role="address"]/text()') - item_loader.add_xpath('city_state', '//*[@data-role="cityState"]/text()') - item_loader.add_xpath('price', '//span[@data-role="price"]/text()', re=r'\$([\d,]+)') - item_loader.add_xpath('neighborhood', '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()') - details = item_loader.nested_css('.homeDetailsHeading') - overview = details.nested_xpath('.//span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]') - overview.add_xpath('overview', xpath='.//li/text()') - overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$') - overview.add_xpath('lot_size', xpath='.//li/text()', re=r'([\d,.]+) (?:acres|sqft) lot size$') - overview.add_xpath('lot_size_units', xpath='.//li/text()', re=r'[\d,.]+ (acres|sqft) lot size$') - overview.add_xpath('price_per_square_foot', xpath='.//li/text()', re=r'\$([\d,.]+)/sqft$') - 
overview.add_xpath('bedrooms', xpath='.//li/text()', re=r'(\d+) (?:Beds|Bed|beds|bed)$') - overview.add_xpath('bathrooms', xpath='.//li/text()', re=r'(\d+) (?:Baths|Bath|baths|bath)$') - overview.add_xpath('year_built', xpath='.//li/text()', re=r'Built in (\d+)') - overview.add_xpath('days_on_Trulia', xpath='.//li/text()', re=r'([\d,]) days on Trulia$') - overview.add_xpath('views', xpath='.//li/text()', re=r'([\d,]+) views$') - item_loader.add_css('description', '#descriptionContainer *::text') - - price_events = details.nested_xpath('.//*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div') - price_events.add_xpath('prices', './div[contains(text(), "$")]/text()') - price_events.add_xpath('dates', './div[contains(text(), "$")]/preceding-sibling::div/text()') - price_events.add_xpath('events', './div[contains(text(), "$")]/following-sibling::div/text()') - @staticmethod - def post_process(item): - '''Add any additional data to an item after loading it''' - if item.get('dates') is not None: - dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in item['dates']] - prices = [int(price.lstrip('$').replace(',', '')) for price in item['prices']] - item['price_history'] = sorted(list(zip(dates, prices, item['events'])), key=lambda x: x[0]) + # tax info + il = ItemLoader(item=taxes_item(), response=response) + table_xpath = '//*[div="Property Taxes and Assessment"]/parent::div/following-sibling::table' + il.add_xpath('property_tax_assessment_year', table_xpath + '//tr[1]/td[1]//text()') + il.add_xpath('property_tax', table_xpath + '//tr[2]/td[1]//text()') + il.add_xpath('property_tax_assessment_land', table_xpath + '//tr[4]/td[1]//text()') + il.add_xpath('property_tax_assessment_improvements', table_xpath + '//tr[5]/td[1]//text()') + il.add_xpath('property_tax_assessment_total', table_xpath + '//tr[6]/td[1]//text()') + tax_dict = il.load_item() + + # 有的“可比较”模块不存在 + comparable_path = '//div[contains(text(), "Comparable Sales")]/../../following-sibling::div[3]' + header = response.xpath(comparable_path + '//th//text()').extract() + header.append('url') + num_tr = len(response.xpath(comparable_path + '//tbody/tr')) + rows = [] + for i in range(1, num_tr+1): + rows.append(response.xpath((comparable_path + '//tbody/tr[{:d}]//text()').format(i)).extract()) + urls = response.xpath(comparable_path + '//tbody//a/@href').extract() + urls = [get_rel_url(response.url, url) for url in urls] + [rows[i].append(urls[i]) for i in range(num_tr)] + comparable_list = [list(zip(header, row)) for row in rows] + + # price_trends + il = ItemLoader(item=price_trends_item(), response=response) + price_trend_node = il.nested_xpath('//*[div="Price Trends"]/parent::div/following-sibling::div[1]') + price_trend_node.add_xpath('item1', './*[3]//text()') + price_trend_node.add_xpath('item2', './*[4]//text()') + price_trend_node.add_xpath('item3', './*[5]//text()') + price_trends_dict = il.load_item() + price_trends = '\n'.join(list(price_trends_dict.values())) + + # local common + total_reviews = [] + reviews = [] + review_count = response.xpath('count(//div[@data-testid="wls-responisve-slider"]/div/div/child::node())').extract()[0] + review_count = int(float(review_count)) + for i in range(1, 1 + review_count): + reviews.append(' '.join( + response.xpath('//div[@data-testid="wls-responisve-slider"]/div/div/*[{:d}]//text()'.format(i)).extract())) + reviews = '\n'.join(reviews) + common_count = response.xpath('count(//div[@data-testid="what-locals-say"]/child::node())').extract()[0] + common_count = 
int(float(common_count)) + for i in range(1, common_count): + total_reviews.append(' '.join( + response.xpath('//div[@data-testid="what-locals-say"]/*[{:d}]//text()'.format(i)).extract())) + total_reviews.append(reviews) + + #similar_house + base_xpath = '//*[div="Similar Homes You May Like"]/parent::div/following-sibling::div[1]/div/div' + similar_house = self.get_similar_new_part(base_xpath, response) + + # new linking house + base_xpath = '//div[contains(text(), "New Listings near")]/../../following-sibling::div[1]/div/div' + new_link_house = self.get_similar_new_part(base_xpath, response) + + # all new homes + builder_tr_count = response.xpath('count(//table[@data-testid="quick-movein-builder-homes-table"]//tr)').extract()[0] + builder_tr_count = int(float(builder_tr_count)) + builder_tables = [] + for i in range(1, 1 + builder_tr_count): + builder_tables.append(response.xpath( + '//table[@data-testid="quick-movein-builder-homes-table"]//tr[{:d}]/td//text()'.format(i)).extract()) + + builder_plans = [] + for i in range(1, 1 + builder_tr_count): + builder_plans.append(response.xpath( + '//table[@data-testid="planned-builder-homes-table"]//tr[{:d}]/td//text()'.format(i)).extract()) + + new_homes = {} + if len(builder_tables) > 0: + new_homes['quick-movein-builder'] = builder_tables + if len(builder_plans) > 0: + new_homes['planned-builder'] = builder_plans + + + + il = ItemLoader(item=TruliaItem(), response=response) + # home detail + il.add_xpath('home_detail', + '//div[contains(text(), "Home Details for")]/../../following-sibling::ul/li//text()') + + # description + il.add_xpath('description', + '(//*[div="Description"]/parent::div)[2]/following-sibling::div//text()') + + il.add_xpath('community_description', + '//div[@data-testid="community-description-text-description-text"]//text()') + il.add_xpath('office_hours', + '//div[@data-testid="office-hours-container"]//text()') + il.add_xpath('open_house', + '//div[@data-testid="open-house-container"]//text()') + # local_commons + + item = il.load_item() + + # price_history may not exist + try: + dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in price_dict['dates']] + prices = [int(price.lstrip('$').replace(',', '')) for price in price_dict['prices']] + item['price_history'] = sorted(list(zip(dates, prices, price_dict['events'])), key=lambda x: x[0]) + except: + item['price_history'] = [] + + # overview + item['overview'] = overview_dict + + # property_tax may not exist + item['property_taxes'] = tax_dict + + #local_view + item['local_information'] = local_dict_values + + # price_trends + item['price_trends'] = price_trends + + # comparable_sales + item['comparable_sales'] = comparable_list + + # local_commons + item['local_commons'] = total_reviews + + # similar house + item['similar_homes'] = similar_house + + # new_link house + item['new_listing'] = new_link_house + + # new homes + item['new_homes'] = new_homes + return item + + @staticmethod + def get_similar_new_part(base_xpath, response): + result_list = [] + child_num = len(response.xpath(base_xpath + '/child::node()')) + for i in range(1, child_num): + il = ItemLoader(item=basic_info_item(), response=response) + result_list.append(TruliaSpider.get_basic_house_info(il, str(i), response.url, + base_xpath)) + return result_list + + + @staticmethod + def get_basic_house_info(item_loader, nth, url, base_xpath): + basic_info = item_loader.nested_xpath(base_xpath) + basic_info.add_xpath('price', xpath='./*[{}]//text()'.format(nth), re=r'\$([\d,]+)') + 
basic_info.add_xpath('area', xpath='./*[{}]//text()'.format(nth), re=r'^([\d,]+)\s?sqft$') + basic_info.add_xpath('bedrooms', xpath='./*[{}]//text()'.format(nth), re=r'(\d+)\s?(?:Beds|Bed|beds|bed|bd)$') + basic_info.add_xpath('bathrooms', xpath='./*[{}]//text()'.format(nth), re=r'(\d+\.?\d{0,})\s?(?:Baths|Bath|baths|bath|ba)$') + basic_info.add_xpath('city_state', xpath='./*[{}]//div[@data-testid="property-street"]/text()'.format(nth)) + basic_info.add_xpath('address', xpath='./*[{}]//div[@data-testid="property-region"]/text()'.format(nth)) + basic_info.add_xpath('url', xpath='./*[{}]//a/@href'.format(nth)) + basic_house = item_loader.load_item() + basic_house['url'] = get_rel_url(url, basic_house['url']) + return basic_house \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia_sold.py b/trulia_scraper/spiders/trulia_sold.py index d973d3b..ef3c25d 100644 --- a/trulia_scraper/spiders/trulia_sold.py +++ b/trulia_scraper/spiders/trulia_sold.py @@ -1,49 +1,49 @@ -# -*- coding: utf-8 -*- -import os -import scrapy -from scrapy.linkextractors import LinkExtractor -import trulia_scraper.parsing as parsing -from trulia_scraper.items import TruliaItem, TruliaItemLoader -import trulia_scraper.spiders.trulia as trulia -from scrapy.utils.conf import closest_scrapy_cfg - - -class TruliaSpider(scrapy.Spider): - name = 'trulia_sold' - allowed_domains = ['trulia.com'] - custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines'} - - def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): - super().__init__(*args, **kwargs) - self.state = state - self.city = city - self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)] - self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/') - - def parse(self, response): - N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response) - self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. 
Proceeding to scrape each index page...".format(N=N)) - for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: - yield scrapy.Request(url=url, callback=self.parse_index_page) - - def parse_index_page(self, response): - for link in self.le.extract_links(response): - yield scrapy.Request(url=link.url, callback=self.parse_property_page) - - def parse_property_page(self, response): - item_loader = TruliaItemLoader(item=TruliaItem(), response=response) - trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response) - - details = item_loader.nested_css('.homeDetailsHeading') - taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div') - taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()') - taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()') - - item = item_loader.load_item() - trulia.TruliaSpider.post_process(item=item) - return item +# # -*- coding: utf-8 -*- +# import os +# import scrapy +# from scrapy.linkextractors import LinkExtractor +# import trulia_scraper.parsing as parsing +# from trulia_scraper.items import TruliaItem, TruliaItemLoader +# import trulia_scraper.spiders.trulia as trulia +# from scrapy.utils.conf import closest_scrapy_cfg +# +# +# class TruliaSpider(scrapy.Spider): +# name = 'trulia_sold' +# allowed_domains = ['trulia.com'] +# custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'), +# 'FEED_FORMAT': 'jsonlines'} +# +# def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.state = state +# self.city = city +# self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)] +# self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/') +# +# def parse(self, response): +# N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response) +# self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. 
Proceeding to scrape each index page...".format(N=N)) +# for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: +# yield scrapy.Request(url=url, callback=self.parse_index_page) +# +# def parse_index_page(self, response): +# for link in self.le.extract_links(response): +# yield scrapy.Request(url=link.url, callback=self.parse_property_page) +# +# def parse_property_page(self, response): +# item_loader = TruliaItemLoader(item=TruliaItem(), response=response) +# trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response) +# +# details = item_loader.nested_css('.homeDetailsHeading') +# taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div') +# taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()') +# +# item = item_loader.load_item() +# trulia.TruliaSpider.post_process(item=item) +# return item diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..fc2eabe --- /dev/null +++ b/utils.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-22 下午3:46 +# @Author : RenMeng + +from urllib.parse import urljoin +from urllib.parse import urlparse +from urllib.parse import urlunparse +from posixpath import normpath + + +def get_rel_url(cur_url, rel_url): + if not rel_url.startswith("./") and not rel_url.startswith("/"): + return rel_url + url1 = urljoin(cur_url, rel_url) + arr = urlparse(url1) + path = normpath(arr[2]) + return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment)) + From b8575579c4b81e03dabd615bd80e1862687066f0 Mon Sep 17 00:00:00 2001 From: RenMeng <853750873@qq.com> Date: Fri, 1 Nov 2019 10:46:16 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=B7=BB=E5=8A=A0red=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E7=88=AC=E8=99=AB=20for=20redfin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 +- csv_to_json.py | 43 +++ proxies.txt | 0 redfin.py | 532 +++++++++++++++++++++++++++++++ redfin_run.py | 9 + trulia_scraper/spiders/trulia.py | 4 +- 6 files changed, 593 insertions(+), 3 deletions(-) create mode 100644 csv_to_json.py create mode 100644 proxies.txt create mode 100644 redfin.py create mode 100644 redfin_run.py diff --git a/README.md b/README.md index 5ef7732..848b994 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,12 @@ Details please refer to [khpeek](https://github.com/khpeek/trulia-scraper) ## Basic usage 1. `trulia`, which scrapes all real estate listings which are _for sale_ in a given state and city starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/); +'scrapy crawl trulia -output data.csv' ## To do -1. 
`trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). \ No newline at end of file +1. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). + + +# Selenium+Bs4 for redfin +## Basic usage +`python redfin_run.py` \ No newline at end of file diff --git a/csv_to_json.py b/csv_to_json.py new file mode 100644 index 0000000..63d58bf --- /dev/null +++ b/csv_to_json.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-31 下午3:31 +# @Author : RenMeng + +import pandas as pd +import json +import re + +data = pd.read_csv('data.csv') +data = data.fillna('') +nrows = data.shape[0] + +for j in range(nrows): + + webdata = data.iloc[j].to_dict() + tars = ['overview', 'property_taxes', 'new_homes', + 'price_history', 'similar_homes', 'new_listing', 'comparable_sales'] + + for key in webdata: + if key == 'local_commons': + webdata[key] = re.sub('[^0-9A-Za-z!\?\.,:\"\"\'\' \n]', '', webdata[key]) + if key in tars and webdata[key] != '': + try: + webdata[key] = eval(webdata[key]) + except: + v = webdata[key] + v = re.findall('\((datetime.*?[^0-9])\)', v) + new_v = [] + for _ele in v: + new_ele = re.sub('\)|datetime\.datetime\(', '', _ele).split(',') + new_v.append(['-'.join([i.strip() for i in new_ele[:3]]), new_ele[-2], eval(new_ele[-1])]) + webdata[key] = new_v + elif key != '': + ele = webdata[key].split('\n') + if len(ele) > 1: + webdata[key] = ele + + if key == 'comparable_sales': + webdata[key] = [{ele[0]: ele[1] for ele in line} for line in webdata[key]] + + + open('./result/trulia/trulia_output_{:d}.json'.format(j), 'w', encoding='utf-8').\ + write(json.dumps(webdata, indent=4, ensure_ascii=False)) \ No newline at end of file diff --git a/proxies.txt b/proxies.txt new file mode 100644 index 0000000..e69de29 diff --git a/redfin.py b/redfin.py new file mode 100644 index 0000000..186b735 --- /dev/null +++ b/redfin.py @@ -0,0 +1,532 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-28 上午10:41 +# @Author : RenMeng + +# coding=utf-8 + +# from selenium import webdriver +# url = 'https://www.google.com/recaptcha/api2/demo' +# browser = webdriver.Chrome() +# browser.get(url) +# browser.find_elements_by_tag_name("iframe")[0] + + +from time import sleep +import requests +from selenium import webdriver +from bs4 import BeautifulSoup +from collections import OrderedDict +import re +import json +from random import choice, randint +from selenium.webdriver import Firefox +from selenium.webdriver.firefox.webdriver import FirefoxProfile +from bs4 import NavigableString + +import urllib3 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +reg_property_history_row = re.compile('propertyHistory\-[0-9]+') +reg_offerinsight_row = re.compile('offerInsights\-[0-9]+') +reg_property_urls = re.compile('(/[A-Z][A-Z]/[A-Za-z\-/0-9]+/home/[0-9]+)') +user_agent_header = { + 'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36'} + + +class RedFin(): + def __init__(self): + self.start_url = 'https://www.redfin.com/zipcode/94043' + # self.session = requests.Session() + self.use_selenium = True + # proxy option can be set after class object is loaded + # self.use_proxies = True + self.output_data = [] + self.property_urls = [] + PROXY = "http://127.0.0.1:8118" 
# IP:PORT or HOST:PORT + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument('--proxy-server=%s' % PROXY) + self.driver = webdriver.Chrome(options=chrome_options) + + # load proxies from file, one per line, in proxy:port format + # self.proxies = [l.rstrip() for l in open('proxies.txt').readlines()] + # make a separate session for each proxy + # self.sessions = {} + # for proxy in self.proxies: + # self.sessions[proxy] = { + # 'session': requests.Session(), + # 'proxy': {'http': 'http://' + proxy, + # 'https': 'https://' + proxy} + # } + # load data collected so far in order to avoid needing to scrape + # the same data twice + # try: + # self.output_data = json.loads(open('redfin_output.json').read()) + # except: + # self.output_data = [] + + def rand_sleep(self): + # you can set the random sleep time for no browser mode here + sleep(randint(5, 10)) + + def parse_finished_urls(self): + # function for removing urls that have already been completed + done_urls_list = set() + for property_data in self.output_data: + url = property_data['url'][22:] + done_urls_list.add(url) + if url in self.property_urls: self.property_urls.remove(url) + print(str(len(done_urls_list)) + ' properties already done') + print(str(len(self.property_urls)) + ' properties to go') + + def get_search_results(self): + page_source = self.request_search_page(self.start_url) + self.property_urls = reg_property_urls.findall(page_source.replace('\\u002F', '/')) + self.property_urls = list(set(self.property_urls)) + print('found ' + str(len(self.property_urls)) + ' results') + self.parse_finished_urls() + + def request_search_page(self, page_url): + if self.use_selenium: + self.use_selenium = False + return self.get_page_selenium(page_url) + else: + return self.make_page_request(page_url) + + def get_property_data(self): + count = 0 + for property_url in self.property_urls: + webdata = self.get_property_page(property_url) + count += 1 + open('./result/redfin_output_{:d}.json'.format(count), 'w', encoding='utf-8').\ + write(json.dumps(webdata, indent=4, ensure_ascii=False)) + print('finished page ' + str(count)) + self.output_data.append(webdata) + + def make_page_request(self, property_url): + self.rand_sleep() + if self.use_selenium: + return self.get_page_selenium('https://www.redfin.com' + property_url) + # elif self.use_proxies: + # return self.make_page_request_proxy(property_url) + else: + return self.make_page_request_no_proxy('https://www.redfin.com' + property_url) + + def make_page_request_no_proxy(self, property_url): + # use a loop to handle various http request errors and retry; + # if 10 failures are reached, assume we've been blocked + # for i in range(10): + # try: + # http_response = self.session.get(property_url, headers=user_agent_header, verify=False) + # if http_response.status_code == 200: break + # except Exception as e: + # print(1, 'Request error') + # if i == 9: print(1, 'blocked error');exit() + # return http_response.text + self.driver.get(property_url) + return self.driver.page_source + + def make_page_request_proxy(self, property_url): + # use a loop to handle various http request errors and retry; + # if 10 failures are reached, assume we've been blocked + for i in range(10): + try: + session = self.sessions[choice(self.proxies)] + http_response = session['session'].get(property_url, headers=user_agent_header, + proxies=session['proxy'], verify=False) + if http_response.status_code == 200: break + except Exception as e: + print(2, 'Request error') + if i == 9: print(2, 'blocked error');exit() + return 
http_response.text + + def get_property_page(self, property_url): + page_source = self.make_page_request(property_url) + return self.parse_property_page(page_source, property_url) + + def parse_property_page(self, page_source, property_url): + self.soup = BeautifulSoup(page_source, 'html.parser') + property_data = OrderedDict() + + # basic_info + property_data['basic_info'] = OrderedDict() + property_data['basic_info']['url'] = 'https://www.redfin.com' + property_url + # use try catch to handle when a data point is not available + try: + property_data['basic_info']['street_address'] = \ + self.soup.find('span', attrs={'class': 'street-address'}).get_text() + except: + print('street_address not found') + + try: + property_data['basic_info']['address_locality'] = \ + self.soup.find('span', attrs={'class': 'citystatezip'}).get_text() + except: + print('address_locality not found') + + try: + property_data['basic_info']['price'] = \ + self.soup.find('div', attrs={'class': 'info-block price'}).find('div').get_text() + except: + print('price not found') + + try: + property_data['basic_info']['beds'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-beds'}).find('div').get_text() + except: + print('beds not found') + + try: + property_data['basic_info']['baths'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-baths'}).find('div').get_text() + except: + print('baths not found') + + try: + property_data['basic_info']['sqFt'] = \ + ' '.join([item.get_text() for item in + (self.soup.select('span.statsValue') + self.soup.select('span.sqft-label'))]) + except: + print('sqFt not found') + + try: + property_data['basic_info']['price_per_sqFt'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-sqFt'}).\ + find('div', attrs={"data-rf-test-id": "abp-priceperft"}).get_text() + except: + print('price_per_sqFt not found') + + try: + property_data['basic_info']['redfin estimate'] =\ + self.soup(text=re.compile('Redfin Estimate:'))[0].parent.parent.parent.\ + next_sibling.get_text() + except: + print('redfin estimate not found') + + try: + property_data['basic_info']['days on Redfin'] = \ + self.soup(text=re.compile('On Redfin'))[0].parent.next_sibling.get_text() + except: + print('days on Redfin not found') + + try: + property_data['basic_info']['year_built'] = \ + self.soup.find('span', attrs={"data-rf-test-id": "abp-yearBuilt"}).\ + find('span', attrs={'class': 'value'}).get_text() + except: + print('year_built not found') + + try: + property_data['basic_info']['status'] = \ + self.soup.find('span', attrs={"data-rf-test-id": "abp-status"}).\ + find('span', attrs={'class': 'value'}).get_text() + except: + print('status not found') + + # overview + overview = {} + try: + overview['describe'] = self.soup.select('div.house-info')[0].\ + select('div[class*="remarks"]')[0].get_text() + except: + overview['describe'] = 'not found' + details = OrderedDict() + try: + for child in self.soup.find('div', attrs={'class': 'keyDetailsList'}).children: + cells = list(child.children) + details[cells[0].get_text().strip()] = cells[1].get_text().strip() + except: + pass + overview['detail'] = details + property_data['overview'] = overview + + # use loops to maintain data structure ina dict + property_data['property_details'] = OrderedDict() + try: + for category in self.soup.find('div', attrs={'class': 'amenities-container'}).children: + if category.get('class')[0] == 'super-group-title': + key = category.contents[0] + elif category.get('class')[0] == 'super-group-content': + 
property_data['property_details'][key] = OrderedDict() + for row in category.find_all('div', attrs={'class': 'amenity-group'}): + key2 = row.find('h3').get_text() + property_data['property_details'][key][key2] = [] + for row2 in row.find_all('li'): + property_data['property_details'][key][key2].append(row2.get_text()) + except: + pass + + property_data['property_history'] = [] + try: + for row in self.soup.find_all('tr', attrs={'id': reg_property_history_row}): + data_cells = row.find_all('td') + history_data_row = OrderedDict() + history_data_row['date'] = data_cells[0].get_text() + history_data_row['event & source'] = data_cells[1].get_text() + history_data_row['price'] = data_cells[2].get_text() + history_data_row['appreciation'] = data_cells[3].get_text() + property_data['property_history'].append(history_data_row) + except: + pass + + + property_data['school'] = OrderedDict() + try: + school_tabs = [item.get_text() for item in self.soup.find('div', attrs={'class':'scrollable tabs'})] + for tab in school_tabs: + self.driver.find_element_by_xpath('//button[text()="{}"]'.format(tab)).click() + self.soup = BeautifulSoup(self.driver.page_source, 'html.parser') + school_table = self.soup.select('div.schools-content')[0].select('tr.schools-table-row') + thead = [item.get_text() for item in school_table[0].find_all('th')] + tbody = [[item for item in trow.find_all('td')] for trow in school_table[1:]] + col_num = len(thead) + tbody = [[row[i].find('div', attrs={'data-tf-test-name', 'school-name'}).get_text() + if i == 0 else row[i].get_text() for i in range(col_num)] for row in tbody] + school_item = [{thead[i]: row[i] for i in range(col_num)} for row in tbody] + property_data['school'][tab] = school_item + except: + pass + + property_data['insights'] = [] + try: + for item in self.soup.find('div', + attrs={'data-rf-test-id': 'tourInsights'}).select('div.currentTourInsights')[0].children: + common = OrderedDict() + common['note'] = item.select('div.note')[0].get_text() + common['agent-info'] = item.select('div.agent-info')[0].find('div').get_text() + common['date'] = item.select('div.agent-info')[0].find('span', attrs={'class': 'date'}).get_text() + property_data['insights'].append(common) + except: + pass + + property_data['activity'] = [] + try: + for item in self.soup.find('div', + attrs={'data-rf-test-id': 'activitySection'}).find_all('td'): + property_data['activity'].append(' '.join([child.get_text() for child in item.select('div.labels')[0].contents])) + except: + pass + + property_data['public-facts'] = OrderedDict() + try: + for child in self.soup.select('div.public-records-taxes')[0].children: + key = child.find('h3').get_text() + property_data['public-facts'][key] = OrderedDict() + for tr in child.find_all('tr'): + cells = list(tr.children) + property_data['public-facts'][key][cells[0].get_text()] = cells[1].get_text() + property_data['public-facts']['home-facts'] = OrderedDict() + for child in self.soup.select('div.facts-table')[0].select('div.table-row'): + cells = list(child.contents) + property_data['public-facts']['home-facts'][cells[0].get_text()] = cells[1].get_text() + except: + pass + + try: + for child in 
self.soup.select('div.facts-table')[0].select('div.table-row'): + cells = list(child.contents) + property_data['public-facts']['home-facts'][cells[0].get_text()] = cells[1].get_text() + except: + pass + + try: + key = self.soup.select('#redfin-estimate')[0].find('h2').get_text() + property_data[key] = OrderedDict() + property_data[key]['EstimateValue'] = self.soup.select('#redfin-estimate')[0].select( + 'div[class*="RedfinEstimateValueHeader"]')[0].get_text() + property_data[key]['PriceDiff'] = self.soup.select('#redfin-estimate')[0].select( + 'div[class*="listPriceDiff"]')[0].get_text() + property_data[key]['comps'] = OrderedDict() + property_data[key]['comps']['based_on'] = self.soup.select('#redfin-estimate')[0].select( + 'div.comps')[0].contents[0].get_text() + property_data[key]['comps']['homecard'] = [] + for node in self.soup.select('#redfin-estimate')[0].select('div.comps')[0].select( + 'div.homecard'): + card = {} + card['url'] = 'https://www.redfin.com' + node.find('a')['href'] + card['sold_date'] = [item.get_text() for item in node.select('div.topleft')[0]] + card['details'] = [item.get_text() for item in node.select('div.left')[0]] + \ + [item.get_text() for item in node.select('div.right')[0].contents[0].children] + property_data[key]['comps']['homecard'].append(card) + except: + print('redfin-estimate not found') + + + # try: + # key = self.soup.select('#redfin-estimate')[0].find('h2').get_text() + # property_data[key] = OrderedDict() + # except: + # pass + + try: + key = [] + for child in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).find('h2').children: + if isinstance(child, NavigableString): + key.append(child) + else: + key += [item.get_text().strip() for item in child.children if item.name != 'script'] + key = ' '.join(key) + property_data[key] = OrderedDict() + key2 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).select( + 'h3[class*="walkscore-header"]')[0].get_text().strip() + property_data[key][key2] = [] + for child in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).select( + 'div.walk-score')[0].select('div.scrollable')[0].contents[0].children: + property_data[key][key2].append(' '.join([i.get_text() for i in child.children])) + desc = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.desc.blurb')[0].get_text() + property_data[key][key2].append(desc) + try: + key3 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).select( + 'div.OfferInsights')[0].find('h3').get_text() + property_data[key][key3] = OrderedDict() + for tr in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).select( + 'div.OfferInsights')[0].find('table', attrs={'class': 'basic-table'}).find_all('tr'): + for td in tr.find_all('td'): + cell = list(td.children) + property_data[key][key3][cell[0].get_text().strip()] = cell[1].get_text().strip() + except: + pass + + # 4th + try: + key4 = self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).\ + select('div.title.primary-heading.h3')[0].get_text() + property_data[key][key4] = [] + for row in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).\ + find_all('li', attrs={'id': reg_offerinsight_row}): + line = OrderedDict() + target_value = ['offer-value', 'sale-date', 'home-stats', + 'offer-result-line', 'offer-insight', ] + for _v in target_value: + try: + line[_v] = row.select('div.{}'.format(_v))[0].get_text() + except: + pass + try: + line['agent-info'] = 
row.select('div.agent-info')[0].select('span.agent-detail-name')[0].get_text() + except: + pass + property_data[key][key4].append(line) + except: + pass + + # 5th + try: + key5 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.statsAndChartsContainer')[0].find('h3').get_text() + property_data[key][key5] = [] + table = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.statsAndChartsContainer')[0].find('table', attrs={'class': 'basic-table'}) + header = [th.get_text() for th in table.find('thead').find_all('th')] + header_num = len(header) + for tr in table.find('tbody').find_all('tr'): + line = OrderedDict() + value = [td if isinstance(td, NavigableString) else td.get_text() + for td in tr.find_all('td')] + for i in range(header_num): + line[header[i]] = value[i] + property_data[key][key5].append(line) + + except: + pass + + except: + print('neighborhood info not found') + + try: + key = 'Nearby Similar Homes' + property_data[key] = OrderedDict() + try: + children = list(self.soup(text=re.compile(key))[1].parent.next_sibling.children) + except: + children = list(self.soup(text=re.compile(key))[0].parent.next_sibling.children) + + property_data[key]['desc'] = children[0].get_text() + property_data[key]['home_list'] = [] + for child in children[1].find_all('div', attrs={'class': 'SimilarHomeCardReact'}): + home_card = {} + home_card['url'] = 'https://www.redfin.com' + child.find('a')['href'] + details = [] + try: + details.append(child.select('div.topleft')[0].get_text()) + except: + pass + for item in child.select('div.bottomV2')[0].children: + if item.name == 'script': + continue + details += [i if isinstance(i, NavigableString) else i.get_text() for i in item.children] + print(details) + home_card['details'] = ' '.join(details) + property_data[key]['home_list'].append(home_card) + except: + print('similar list not found') + + try: + key = 'Nearby Recently Sold Homes' + property_data[key] = OrderedDict() + children = list(self.soup(text=re.compile(key))[0].parent.next_sibling.children) + property_data[key]['desc'] = children[0].get_text() + property_data[key]['home_list'] = [] + for child in children[1].find_all('div', attrs={'class': 'SimilarHomeCardReact'}): + home_card = {} + home_card['url'] = 'https://www.redfin.com' + child.find('a')['href'] + details = [] + try: + details.append(child.select('div.topleft')[0].get_text()) + except: + pass + for item in child.select('div.bottomV2')[0].children: + if item.name == 'script': + continue + details += [i if isinstance(i, NavigableString) else i.get_text() for i in item.children] + home_card['details'] = ' '.join(details) + property_data[key]['home_list'].append(home_card) + except: + print('recent sold not found') + print(property_data) + return property_data + + def use_browser(self): + self.use_selenium = True + firefox_profile = FirefoxProfile() + # might as well turn off images since we don't need them + if self.use_proxies: + # if use proxies is true load firefox with proxies + firefox_profile.set_preference("permissions.default.image", 2) + proxy_host, proxy_port = choice(self.proxies).split(':') + firefox_profile.set_preference("network.proxy.type", 1) + firefox_profile.set_preference("network.proxy.http", proxy_host) + firefox_profile.set_preference("network.proxy.http_port", int(proxy_port)) + firefox_profile.set_preference("network.proxy.ssl", proxy_host) + firefox_profile.set_preference("network.proxy.ssl_port", int(proxy_port)) + self.driver = 
Firefox(firefox_profile) + self.driver.implicitly_wait(2) + + def get_page_selenium(self, page_url): + self.driver.get(page_url) + self.selenium_bypass_captcha() + return self.driver.page_source + + def selenium_bypass_captcha(self): + # basic code for handling captcha + # this requires the user to actually solve the captcha and then continue + # try: + print('do check.....') + self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[0]) + self.driver.find_element_by_class_name('recaptcha-checkbox-border').click() + print('solve captcha ( pop up only ) and press enter to continue') + input() + self.driver.switch_to.default_content() + self.driver.find_element_by_id('submit').click() + # except Exception as e: + # pass + + + diff --git a/redfin_run.py b/redfin_run.py new file mode 100644 index 0000000..03ad725 --- /dev/null +++ b/redfin_run.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-28 上午10:47 +# @Author : RenMeng + +from redfin import RedFin + +redfin = RedFin() +redfin.get_search_results() +redfin.get_property_data() \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia.py b/trulia_scraper/spiders/trulia.py index 974f5a6..d2b6a2d 100644 --- a/trulia_scraper/spiders/trulia.py +++ b/trulia_scraper/spiders/trulia.py @@ -24,7 +24,7 @@ class TruliaSpider(scrapy.Spider): name = 'trulia' allowed_domains = ['trulia.com'] custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines', + 'FEED_FORMAT': 'json', 'FEED_EXPORT_FIELDS': cols} def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): @@ -32,7 +32,7 @@ def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): self.state = state self.city = city # self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] - self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] + self.start_urls = ['http://trulia.com/{state}/{city}/94043'.format(state=state, city=city)] # self.le = LinkExtractor(allow=[r'^https://www.trulia.com/p/ca', r'^https://www.trulia.com/property', r'^https://www.trulia.com/builder-community']) self.link_path = '//div[@data-testid="search-result-list-container"]//a/@href'
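Usage note: a minimal sketch of how the get_rel_url() helper added in utils.py is expected to resolve hrefs extracted from the search-result list (assuming the project root is on the import path); the listing path and property id below are made up for illustration and are not taken from this patch.

    # Sketch only: exercises utils.get_rel_url() as defined in this patch; example URLs are hypothetical.
    from utils import get_rel_url

    # Relative hrefs are joined against the current page URL and path-normalized.
    print(get_rel_url('https://www.trulia.com/CA/Mountain_View/2_p/',
                      '/p/ca/mountain-view/1000-example-st-94043'))
    # https://www.trulia.com/p/ca/mountain-view/1000-example-st-94043

    # Absolute URLs (not starting with "/" or "./") are returned unchanged.
    print(get_rel_url('https://www.trulia.com/CA/Mountain_View/',
                      'https://www.trulia.com/property/1234567890'))
    # https://www.trulia.com/property/1234567890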