From f1078e4837b9f6df4e727af35f63ed3d69b40a64 Mon Sep 17 00:00:00 2001 From: RenMeng <853750873@qq.com> Date: Fri, 25 Oct 2019 17:41:29 +0800 Subject: [PATCH 1/2] scrapy trulia for-sale houses --- README.md | 35 +--- trulia_scraper/items.py | 173 ++++++++++------ trulia_scraper/middlewares.py | 27 +++ trulia_scraper/parsing.py | 15 +- trulia_scraper/pipelines.py | 5 +- trulia_scraper/settings.py | 8 + trulia_scraper/spiders/trulia.py | 273 +++++++++++++++++++++----- trulia_scraper/spiders/trulia_sold.py | 98 ++++----- utils.py | 18 ++ 9 files changed, 460 insertions(+), 192 deletions(-) create mode 100644 utils.py diff --git a/README.md b/README.md index 8783c57..5ef7732 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,8 @@ -# trulia-scraper -Scraper for real estate listings on [Trulia.com](https://www.trulia.com/) implemented in Python with Scrapy. +# Update trulia-scraper +Details please refer to [khpeek](https://github.com/khpeek/trulia-scraper) ## Basic usage -To crawl the scraper, you need to install [Python 3](https://www.python.org/download/releases/3.0/), as well as the [Scrapy](https://pypi.python.org/pypi/Scrapy) framework and the [Pyparsing](https://pypi.python.org/pypi/pyparsing/2.2.0) module. The scraper features two spiders: - 1. `trulia`, which scrapes all real estate listings which are _for sale_ in a given state and city starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/); -2. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). - -To crawl the `trulia_sold` spider for the state of `CA` and city of `San_Francisco` (the default locale), simply run the command - -``` -scrapy crawl trulia_sold -``` -from the project directory. To scrape listings for another city, specify the `city` and `state` arguments using the `-a` flag. For example, - -``` -scrapy crawl trulia_sold -a state=NY -a city=New_York -``` -will scrape all listings reachable from [https://www.trulia.com/sold/New_York,NY/](https://www.trulia.com/sold/New_York,NY/). - -By default, the scraped data will be stored (using Scrapy's [feed export](https://doc.scrapy.org/en/latest/topics/feed-exports.html)) in the `data` directory as a [JSON lines](http://jsonlines.org/) (`.jl`) file following the naming convention - -``` -data_{sold|for_sale}_{state}_{city}_{time}.jl -``` - -where `{sold|for_sale}` is `sold` or `for_sale` for the `trulia` and `trulia_sold` spiders, respectively, `{state}` and `{city}` are the specified state and city (e.g. `CA` and `San_Francisco`, respectively), and `{time}` represents the current UTC time. - -If you prefer a different output file name and format, you can specify this from the command line using Scrapy's `-o` option. For example, -``` -scrapy crawl trulia_sold -a state=WA -city=Seattle -o data_Seattle.csv -``` -will output the data in CSV format as `data_Seattle.csv`. (Scrapy automatically picks up the file format from the specified file extension). \ No newline at end of file +## To do +1. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). 
\ No newline at end of file diff --git a/trulia_scraper/items.py b/trulia_scraper/items.py index 6b7ac3d..8470809 100644 --- a/trulia_scraper/items.py +++ b/trulia_scraper/items.py @@ -1,70 +1,129 @@ # -*- coding: utf-8 -*- -from scrapy.loader import ItemLoader -from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Compose +from scrapy.loader.processors import TakeFirst, Identity, Compose, Join import scrapy -from trulia_scraper.parsing import remove_empty, get_number_from_string +from trulia_scraper.parsing import * +class overview_item(scrapy.Item): + url = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + address = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + city_state = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price = scrapy.Field( + output_processor=Compose(TakeFirst(),get_number_from_string) + ) # for items on sale only + area = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + bedrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) + bathrooms = scrapy.Field( + output_processor= Compose(TakeFirst(), float) + ) + year_built = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) + lot_size = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + lot_size_units = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price_per_square_foot = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + days_on_Trulia = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) -class TruliaItem(scrapy.Item): - url = scrapy.Field() - address = scrapy.Field() - city_state = scrapy.Field() - price = scrapy.Field() # for items on sale only - neighborhood = scrapy.Field() - overview = scrapy.Field() - description = scrapy.Field() +class basic_info_item(scrapy.Item): + url = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + address = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + city_state = scrapy.Field( + output_processor=Compose(TakeFirst()) + ) + price = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) # for items on sale only + area = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + bedrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) + bathrooms = scrapy.Field( + output_processor=Compose(TakeFirst(), float) + ) - # Columns from the 'price events' table are stored in separate lists - prices = scrapy.Field() - dates = scrapy.Field() - events = scrapy.Field() - # Property tax information is on 'sold' pages only - property_tax_assessment_year = scrapy.Field() - property_tax = scrapy.Field() - property_tax_assessment_land = scrapy.Field() - property_tax_assessment_improvements = scrapy.Field() - property_tax_assessment_total = scrapy.Field() - property_tax_market_value = scrapy.Field() +class price_item(scrapy.Item): + prices = scrapy.Field( + output_processor= Identity() + ) + dates = scrapy.Field( + output_processor= Compose(remove_empty) + ) + events = scrapy.Field( + output_processor= Compose(remove_empty) + ) - # The 'Features' sections is on 'for sale' pages only - listing_information = scrapy.Field() - listing_information_date_updated = scrapy.Field() - public_records = scrapy.Field() - public_records_date_updated = scrapy.Field() +class taxes_item(scrapy.Item): + property_tax_assessment_year = scrapy.Field( + output_processor=Compose(TakeFirst(), int) + ) + property_tax = scrapy.Field( + 
output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_land = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_improvements = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + property_tax_assessment_total = scrapy.Field( + output_processor=Compose(TakeFirst(), get_number_from_string) + ) + +class price_trends_item(scrapy.Item): + item1 = scrapy.Field( + output_processor=Compose(Join()) + ) + item2 = scrapy.Field( + output_processor=Compose(Join()) + ) + item3 = scrapy.Field( + output_processor=Compose(Join()) + ) - # Items generated from further parsing of 'raw' scraped data - area = scrapy.Field() - lot_size = scrapy.Field() - lot_size_units = scrapy.Field() - price_per_square_foot = scrapy.Field() # For properties on sale only - bedrooms = scrapy.Field() - bathrooms = scrapy.Field() - year_built = scrapy.Field() - days_on_Trulia = scrapy.Field() - views = scrapy.Field() - price_history = scrapy.Field() +class TruliaItem(scrapy.Item): + overview = scrapy.Field() + local_information = scrapy.Field() #todo + + description = scrapy.Field() + community_description = scrapy.Field() + home_detail = scrapy.Field() + office_hours = scrapy.Field() + open_house = scrapy.Field() + + price_history = scrapy.Field() + similar_homes = scrapy.Field() #todo + new_listing = scrapy.Field() #todo -class TruliaItemLoader(ItemLoader): - default_input_processor = MapCompose(str.strip) - default_output_processor = TakeFirst() + property_taxes = scrapy.Field() + price_trends = scrapy.Field() + comparable_sales = scrapy.Field() - price_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) - overview_out = Identity() - description_out = Compose(remove_empty) - prices_out = Identity() - dates_out = Compose(remove_empty) - events_out = Compose(remove_empty) + local_commons = scrapy.Field() #todo + new_homes = scrapy.Field() - listing_information_out = Identity() - public_records_out = Identity() - area_out = Compose(TakeFirst(), get_number_from_string) - lot_size_out = Compose(TakeFirst(), get_number_from_string) - price_per_square_foot_out = Compose(TakeFirst(), get_number_from_string) - bedrooms_out = Compose(TakeFirst(), int) - bathrooms_out = Compose(TakeFirst(), int) - year_built_out = Compose(TakeFirst(), int) - days_on_Trulia_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) - views_out = Compose(TakeFirst(), lambda s: int(s.replace(',', ''))) diff --git a/trulia_scraper/middlewares.py b/trulia_scraper/middlewares.py index ee550c4..72f9211 100644 --- a/trulia_scraper/middlewares.py +++ b/trulia_scraper/middlewares.py @@ -54,3 +54,30 @@ def process_start_requests(self, start_requests, spider): def spider_opened(self, spider): spider.logger.info('Spider opened: %s' % spider.name) + + +# class CustomProxyMiddleware(object): +# def process_request(self, request, spider): +# request.meta['proxy'] = "http://127.0.0.1:8118" + +# http agents +http_agents = [ + "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", + "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", + "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", + "Mozilla/5.0 (Windows; U; 
Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", + "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", + "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", + "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", + "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:66.0) Gecko/20100101 Firefox/66.0", +] + +def get_agent(): + import random + return random.choice(http_agents) + + +class UserAgentMiddleware(object): + def process_request(self, request, spider): + request.headers['User-Agent'] = get_agent() diff --git a/trulia_scraper/parsing.py b/trulia_scraper/parsing.py index 1ec9e2a..64f0964 100644 --- a/trulia_scraper/parsing.py +++ b/trulia_scraper/parsing.py @@ -1,7 +1,20 @@ +import re +KEYS = set(['schools', 'crime', 'commute', 'shop & eat']) def remove_empty(l): '''Remove items which evaluate to False (such as empty strings) from the input list.''' return [x for x in l if x] +def remove_key_words(l): + return [x for x in l if x.lower() not in KEYS] + def get_number_from_string(string, number_type=float): '''Remove commas from the input string and parse as a number''' - return number_type(string.replace(',', '')) \ No newline at end of file + string = string.replace('$', '') + return number_type(string.replace(',', '')) + +def match_quote(l): + result = re.findall('.*\"(.*)\".*', l) + if len(result) > 0: + return result[0] + else: + return '' \ No newline at end of file diff --git a/trulia_scraper/pipelines.py b/trulia_scraper/pipelines.py index 4714fd0..e316d6e 100644 --- a/trulia_scraper/pipelines.py +++ b/trulia_scraper/pipelines.py @@ -4,8 +4,9 @@ # # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html - +from scrapy import signals +from scrapy.exporters import CsvItemExporter class TruliaScraperPipeline(object): def process_item(self, item, spider): - return item + return item \ No newline at end of file diff --git a/trulia_scraper/settings.py b/trulia_scraper/settings.py index 0f337dd..1aaa840 100644 --- a/trulia_scraper/settings.py +++ b/trulia_scraper/settings.py @@ -17,6 +17,8 @@ # Crawl responsibly by identifying yourself (and your website) on the user-agent #USER_AGENT = 'trulia_scraper (+http://www.yourdomain.com)' +# USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36' + # Obey robots.txt rules ROBOTSTXT_OBEY = True @@ -88,3 +90,9 @@ #HTTPCACHE_DIR = 'httpcache' #HTTPCACHE_IGNORE_HTTP_CODES = [] #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' + +DOWNLOADER_MIDDLEWARES = { + # 'trulia_scraper.middlewares.CustomProxyMiddleware': 10, + # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 20, + 'trulia_scraper.middlewares.UserAgentMiddleware': 543, +} \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia.py b/trulia_scraper/spiders/trulia.py index 741152f..974f5a6 100644 --- a/trulia_scraper/spiders/trulia.py +++ b/trulia_scraper/spiders/trulia.py @@ -1,28 +1,43 @@ # -*- coding: utf-8 -*- import os +import re import scrapy import math import datetime from scrapy.linkextractors import LinkExtractor -from trulia_scraper.items import TruliaItem, TruliaItemLoader +from trulia_scraper.items import * from 
trulia_scraper.parsing import get_number_from_string from scrapy.utils.conf import closest_scrapy_cfg +from scrapy.loader import ItemLoader +from utils import get_rel_url +from itertools import zip_longest + class TruliaSpider(scrapy.Spider): + cols = ['overview', 'local_information', 'description', 'home_detail', + 'community_description', 'office_hours', 'open_house', + 'price_history', 'price_trends', 'property_taxes', + 'new_homes', 'similar_homes', 'new_listing', 'comparable_sales', + 'local_commons'] + name = 'trulia' allowed_domains = ['trulia.com'] custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines'} + 'FEED_FORMAT': 'jsonlines', + 'FEED_EXPORT_FIELDS': cols} - def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): + def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): super().__init__(*args, **kwargs) self.state = state self.city = city + # self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] - self.le = LinkExtractor(allow=r'^https://www.trulia.com/property') + # self.le = LinkExtractor(allow=[r'^https://www.trulia.com/p/ca', r'^https://www.trulia.com/property', r'^https://www.trulia.com/builder-community']) + self.link_path = '//div[@data-testid="search-result-list-container"]//a/@href' def parse(self, response): + print(response.url, response.status) N = self.get_number_of_pages_to_scrape(response) self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N)) for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: @@ -30,64 +45,218 @@ def parse(self, response): @staticmethod def get_number_of_pages_to_scrape(response): - pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]') - number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', '')) + pagination = response.xpath( + '//div[@data-testid="pagination-caption"]/text()').extract() + if len(pagination) > 0: + pagination = pagination[0] + pattern = '^1-30 of ([\d,]+) Results$' + number_of_results = int(re.findall(pattern, pagination)[0].replace(',', '')) + print(number_of_results) return math.ceil(number_of_results/30) def parse_index_page(self, response): - for link in self.le.extract_links(response): - yield scrapy.Request(url=link.url, callback=self.parse_property_page) + # print(len(self.le.extract_links(response))) + # for link in self.le.extract_links(response): + # yield scrapy.Request(url=link.url, callback=self.parse_property_page) + for url in response.xpath(self.link_path).extract(): + yield scrapy.Request(url=get_rel_url(response.url, url), callback=self.parse_property_page) def parse_property_page(self, response): - l = TruliaItemLoader(item=TruliaItem(), response=response) - self.load_common_fields(item_loader=l, response=response) + # overview + il = ItemLoader(item=overview_item(), response=response) + il.add_value('url', response.url) + overview_node = il.nested_xpath('//div[@data-testid="home-details-summary-container"]') + overview_node.add_xpath('address', './/span[@data-testid="home-details-summary-headline"]/text()') + overview_node.add_xpath('city_state', + './/span[@data-testid="home-details-summary-city-state"]/text()') + 
overview_node.add_xpath('price', + './/*[@data-testid="on-market-price-details"]//text()', + re=r'\$([\d,]+)') + overview_node.add_xpath('area', xpath='.//li//text()', re=r'^([\d,]+)\s?sqft$') + overview_node.add_xpath('bedrooms', xpath='.//li//text()', re=r'(\d+\.?\d?) (?:Beds|Bed|beds|bed)$') + overview_node.add_xpath('bathrooms', xpath='.//li//text()', re=r'(\d+\.?\d?) (?:Baths|Bath|baths|bath)$') - listing_information = l.nested_xpath('//span[text() = "LISTING INFORMATION"]') - listing_information.add_xpath('listing_information', './parent::div/following-sibling::ul[1]/li/text()') - listing_information.add_xpath('listing_information_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)') + details = il.nested_xpath('//div[@data-testid="features-container"]') + details.add_xpath('year_built', xpath='.//li//text()', re='Built in (\d+)') + details.add_xpath('lot_size', xpath='.//li//text()', re=r'Lot Size: ([\d,.]+) (?:acres|sqft)$') + details.add_xpath('lot_size_units', xpath='.//li//text()', re=r'Lot Size: [\d,.]+ (acres|sqft)$') + details.add_xpath('price_per_square_foot', xpath='.//li//text()', re=r'\$([\d,.]+)/sqft$') + details.add_xpath('days_on_Trulia', xpath='.//li//text()', re=r'([\d,]+)\+? Days on Trulia$') + overview_dict = il.load_item() - public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]') - public_records.add_xpath('public_records', './parent::div/following-sibling::ul[1]/li/text()') - public_records.add_xpath('public_records_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)') + # local info + local_info_list = response.xpath( + '(//*[div="Local Information"]/parent::div)[2]/following-sibling::div/div/div//text()').extract() + # for i in range(len(local_info_list) - 1, -1, -1): + # if "Map View" in local_info_list[i] or "Street View" in local_info_list[i]: + # local_info_list.remove(local_info_list[i]) + local_dict_values = '\n'.join(local_info_list) - item = l.load_item() - self.post_process(item=item) - return item + # price_history + il = ItemLoader(item=price_item(), response=response) + table_xpath = '//div[contains(text(), "Price History for")]/../../following-sibling::table' + il.add_xpath('dates', table_xpath + '//tr[1]/td[1]//text()') + il.add_xpath('prices', table_xpath + '//tr[1]/td[2]//text()') + il.add_xpath('events', table_xpath + '//tr[1]/td[3]//text()') + price_dict = il.load_item() - @staticmethod - def load_common_fields(item_loader, response): - '''Load field values which are common to "on sale" and "recently sold" properties.''' - item_loader.add_value('url', response.url) - item_loader.add_xpath('address', '//*[@data-role="address"]/text()') - item_loader.add_xpath('city_state', '//*[@data-role="cityState"]/text()') - item_loader.add_xpath('price', '//span[@data-role="price"]/text()', re=r'\$([\d,]+)') - item_loader.add_xpath('neighborhood', '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()') - details = item_loader.nested_css('.homeDetailsHeading') - overview = details.nested_xpath('.//span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]') - overview.add_xpath('overview', xpath='.//li/text()') - overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$') - overview.add_xpath('lot_size', xpath='.//li/text()', re=r'([\d,.]+) (?:acres|sqft) lot size$') - overview.add_xpath('lot_size_units', xpath='.//li/text()', re=r'[\d,.]+ (acres|sqft) lot size$') - overview.add_xpath('price_per_square_foot', xpath='.//li/text()', re=r'\$([\d,.]+)/sqft$') - 
overview.add_xpath('bedrooms', xpath='.//li/text()', re=r'(\d+) (?:Beds|Bed|beds|bed)$') - overview.add_xpath('bathrooms', xpath='.//li/text()', re=r'(\d+) (?:Baths|Bath|baths|bath)$') - overview.add_xpath('year_built', xpath='.//li/text()', re=r'Built in (\d+)') - overview.add_xpath('days_on_Trulia', xpath='.//li/text()', re=r'([\d,]) days on Trulia$') - overview.add_xpath('views', xpath='.//li/text()', re=r'([\d,]+) views$') - item_loader.add_css('description', '#descriptionContainer *::text') - - price_events = details.nested_xpath('.//*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div') - price_events.add_xpath('prices', './div[contains(text(), "$")]/text()') - price_events.add_xpath('dates', './div[contains(text(), "$")]/preceding-sibling::div/text()') - price_events.add_xpath('events', './div[contains(text(), "$")]/following-sibling::div/text()') - @staticmethod - def post_process(item): - '''Add any additional data to an item after loading it''' - if item.get('dates') is not None: - dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in item['dates']] - prices = [int(price.lstrip('$').replace(',', '')) for price in item['prices']] - item['price_history'] = sorted(list(zip(dates, prices, item['events'])), key=lambda x: x[0]) + # tax info + il = ItemLoader(item=taxes_item(), response=response) + table_xpath = '//*[div="Property Taxes and Assessment"]/parent::div/following-sibling::table' + il.add_xpath('property_tax_assessment_year', table_xpath + '//tr[1]/td[1]//text()') + il.add_xpath('property_tax', table_xpath + '//tr[2]/td[1]//text()') + il.add_xpath('property_tax_assessment_land', table_xpath + '//tr[4]/td[1]//text()') + il.add_xpath('property_tax_assessment_improvements', table_xpath + '//tr[5]/td[1]//text()') + il.add_xpath('property_tax_assessment_total', table_xpath + '//tr[6]/td[1]//text()') + tax_dict = il.load_item() + + # 有的“可比较”模块不存在 + comparable_path = '//div[contains(text(), "Comparable Sales")]/../../following-sibling::div[3]' + header = response.xpath(comparable_path + '//th//text()').extract() + header.append('url') + num_tr = len(response.xpath(comparable_path + '//tbody/tr')) + rows = [] + for i in range(1, num_tr+1): + rows.append(response.xpath((comparable_path + '//tbody/tr[{:d}]//text()').format(i)).extract()) + urls = response.xpath(comparable_path + '//tbody//a/@href').extract() + urls = [get_rel_url(response.url, url) for url in urls] + [rows[i].append(urls[i]) for i in range(num_tr)] + comparable_list = [list(zip(header, row)) for row in rows] + + # price_trends + il = ItemLoader(item=price_trends_item(), response=response) + price_trend_node = il.nested_xpath('//*[div="Price Trends"]/parent::div/following-sibling::div[1]') + price_trend_node.add_xpath('item1', './*[3]//text()') + price_trend_node.add_xpath('item2', './*[4]//text()') + price_trend_node.add_xpath('item3', './*[5]//text()') + price_trends_dict = il.load_item() + price_trends = '\n'.join(list(price_trends_dict.values())) + + # local common + total_reviews = [] + reviews = [] + review_count = response.xpath('count(//div[@data-testid="wls-responisve-slider"]/div/div/child::node())').extract()[0] + review_count = int(float(review_count)) + for i in range(1, 1 + review_count): + reviews.append(' '.join( + response.xpath('//div[@data-testid="wls-responisve-slider"]/div/div/*[{:d}]//text()'.format(i)).extract())) + reviews = '\n'.join(reviews) + common_count = response.xpath('count(//div[@data-testid="what-locals-say"]/child::node())').extract()[0] + common_count = 
int(float(common_count)) + for i in range(1, common_count): + total_reviews.append(' '.join( + response.xpath('//div[@data-testid="what-locals-say"]/*[{:d}]//text()'.format(i)).extract())) + total_reviews.append(reviews) + + #similar_house + base_xpath = '//*[div="Similar Homes You May Like"]/parent::div/following-sibling::div[1]/div/div' + similar_house = self.get_similar_new_part(base_xpath, response) + + # new linking house + base_xpath = '//div[contains(text(), "New Listings near")]/../../following-sibling::div[1]/div/div' + new_link_house = self.get_similar_new_part(base_xpath, response) + + # all new homes + builder_tr_count = response.xpath('count(//table[@data-testid="quick-movein-builder-homes-table"]//tr)').extract()[0] + builder_tr_count = int(float(builder_tr_count)) + builder_tables = [] + for i in range(1, 1 + builder_tr_count): + builder_tables.append(response.xpath( + '//table[@data-testid="quick-movein-builder-homes-table"]//tr[{:d}]/td//text()'.format(i)).extract()) + + builder_plans = [] + for i in range(1, 1 + builder_tr_count): + builder_plans.append(response.xpath( + '//table[@data-testid="planned-builder-homes-table"]//tr[{:d}]/td//text()'.format(i)).extract()) + + new_homes = {} + if len(builder_tables) > 0: + new_homes['quick-movein-builder'] = builder_tables + if len(builder_plans) > 0: + new_homes['planned-builder'] = builder_plans + + + + il = ItemLoader(item=TruliaItem(), response=response) + # home detail + il.add_xpath('home_detail', + '//div[contains(text(), "Home Details for")]/../../following-sibling::ul/li//text()') + + # description + il.add_xpath('description', + '(//*[div="Description"]/parent::div)[2]/following-sibling::div//text()') + + il.add_xpath('community_description', + '//div[@data-testid="community-description-text-description-text"]//text()') + il.add_xpath('office_hours', + '//div[@data-testid="office-hours-container"]//text()') + il.add_xpath('open_house', + '//div[@data-testid="open-house-container"]//text()') + # local_commons + + item = il.load_item() + + # price_history may not exist + try: + dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in price_dict['dates']] + prices = [int(price.lstrip('$').replace(',', '')) for price in price_dict['prices']] + item['price_history'] = sorted(list(zip(dates, prices, price_dict['events'])), key=lambda x: x[0]) + except: + item['price_history'] = [] + + # overview + item['overview'] = overview_dict + + # property_tax may not exist + item['property_taxes'] = tax_dict + + #local_view + item['local_information'] = local_dict_values + + # price_trends + item['price_trends'] = price_trends + + # comparable_sales + item['comparable_sales'] = comparable_list + + # local_commons + item['local_commons'] = total_reviews + + # similar house + item['similar_homes'] = similar_house + + # new_link house + item['new_listing'] = new_link_house + + # new homes + item['new_homes'] = new_homes + return item + + @staticmethod + def get_similar_new_part(base_xpath, response): + result_list = [] + child_num = len(response.xpath(base_xpath + '/child::node()')) + for i in range(1, child_num): + il = ItemLoader(item=basic_info_item(), response=response) + result_list.append(TruliaSpider.get_basic_house_info(il, str(i), response.url, + base_xpath)) + return result_list + + + @staticmethod + def get_basic_house_info(item_loader, nth, url, base_xpath): + basic_info = item_loader.nested_xpath(base_xpath) + basic_info.add_xpath('price', xpath='./*[{}]//text()'.format(nth), re=r'\$([\d,]+)') + 
basic_info.add_xpath('area', xpath='./*[{}]//text()'.format(nth), re=r'^([\d,]+)\s?sqft$') + basic_info.add_xpath('bedrooms', xpath='./*[{}]//text()'.format(nth), re=r'(\d+)\s?(?:Beds|Bed|beds|bed|bd)$') + basic_info.add_xpath('bathrooms', xpath='./*[{}]//text()'.format(nth), re=r'(\d+\.?\d{0,})\s?(?:Baths|Bath|baths|bath|ba)$') + basic_info.add_xpath('city_state', xpath='./*[{}]//div[@data-testid="property-street"]/text()'.format(nth)) + basic_info.add_xpath('address', xpath='./*[{}]//div[@data-testid="property-region"]/text()'.format(nth)) + basic_info.add_xpath('url', xpath='./*[{}]//a/@href'.format(nth)) + basic_house = item_loader.load_item() + basic_house['url'] = get_rel_url(url, basic_house['url']) + return basic_house \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia_sold.py b/trulia_scraper/spiders/trulia_sold.py index d973d3b..ef3c25d 100644 --- a/trulia_scraper/spiders/trulia_sold.py +++ b/trulia_scraper/spiders/trulia_sold.py @@ -1,49 +1,49 @@ -# -*- coding: utf-8 -*- -import os -import scrapy -from scrapy.linkextractors import LinkExtractor -import trulia_scraper.parsing as parsing -from trulia_scraper.items import TruliaItem, TruliaItemLoader -import trulia_scraper.spiders.trulia as trulia -from scrapy.utils.conf import closest_scrapy_cfg - - -class TruliaSpider(scrapy.Spider): - name = 'trulia_sold' - allowed_domains = ['trulia.com'] - custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines'} - - def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): - super().__init__(*args, **kwargs) - self.state = state - self.city = city - self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)] - self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/') - - def parse(self, response): - N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response) - self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. 
Proceeding to scrape each index page...".format(N=N)) - for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: - yield scrapy.Request(url=url, callback=self.parse_index_page) - - def parse_index_page(self, response): - for link in self.le.extract_links(response): - yield scrapy.Request(url=link.url, callback=self.parse_property_page) - - def parse_property_page(self, response): - item_loader = TruliaItemLoader(item=TruliaItem(), response=response) - trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response) - - details = item_loader.nested_css('.homeDetailsHeading') - taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div') - taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()') - taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()') - taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()') - - item = item_loader.load_item() - trulia.TruliaSpider.post_process(item=item) - return item +# # -*- coding: utf-8 -*- +# import os +# import scrapy +# from scrapy.linkextractors import LinkExtractor +# import trulia_scraper.parsing as parsing +# from trulia_scraper.items import TruliaItem, TruliaItemLoader +# import trulia_scraper.spiders.trulia as trulia +# from scrapy.utils.conf import closest_scrapy_cfg +# +# +# class TruliaSpider(scrapy.Spider): +# name = 'trulia_sold' +# allowed_domains = ['trulia.com'] +# custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'), +# 'FEED_FORMAT': 'jsonlines'} +# +# def __init__(self, state='CA', city='San_Francisco', *args, **kwargs): +# super().__init__(*args, **kwargs) +# self.state = state +# self.city = city +# self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)] +# self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/') +# +# def parse(self, response): +# N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response) +# self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. 
Proceeding to scrape each index page...".format(N=N)) +# for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]: +# yield scrapy.Request(url=url, callback=self.parse_index_page) +# +# def parse_index_page(self, response): +# for link in self.le.extract_links(response): +# yield scrapy.Request(url=link.url, callback=self.parse_property_page) +# +# def parse_property_page(self, response): +# item_loader = TruliaItemLoader(item=TruliaItem(), response=response) +# trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response) +# +# details = item_loader.nested_css('.homeDetailsHeading') +# taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div') +# taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()') +# taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()') +# +# item = item_loader.load_item() +# trulia.TruliaSpider.post_process(item=item) +# return item diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..fc2eabe --- /dev/null +++ b/utils.py @@ -0,0 +1,18 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-22 下午3:46 +# @Author : RenMeng + +from urllib.parse import urljoin +from urllib.parse import urlparse +from urllib.parse import urlunparse +from posixpath import normpath + + +def get_rel_url(cur_url, rel_url): + if not rel_url.startswith("./") and not rel_url.startswith("/"): + return rel_url + url1 = urljoin(cur_url, rel_url) + arr = urlparse(url1) + path = normpath(arr[2]) + return urlunparse((arr.scheme, arr.netloc, path, arr.params, arr.query, arr.fragment)) + From b8575579c4b81e03dabd615bd80e1862687066f0 Mon Sep 17 00:00:00 2001 From: RenMeng <853750873@qq.com> Date: Fri, 1 Nov 2019 10:46:16 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E6=B7=BB=E5=8A=A0red=E6=B7=BB=E5=8A=A0?= =?UTF-8?q?=E7=88=AC=E8=99=AB=20for=20redfin?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 8 +- csv_to_json.py | 43 +++ proxies.txt | 0 redfin.py | 532 +++++++++++++++++++++++++++++++ redfin_run.py | 9 + trulia_scraper/spiders/trulia.py | 4 +- 6 files changed, 593 insertions(+), 3 deletions(-) create mode 100644 csv_to_json.py create mode 100644 proxies.txt create mode 100644 redfin.py create mode 100644 redfin_run.py diff --git a/README.md b/README.md index 5ef7732..848b994 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,12 @@ Details please refer to [khpeek](https://github.com/khpeek/trulia-scraper) ## Basic usage 1. `trulia`, which scrapes all real estate listings which are _for sale_ in a given state and city starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/); +'scrapy crawl trulia -output data.csv' ## To do -1. 
`trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). \ No newline at end of file +1. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/). + + +# Selenium+Bs4 for redfin +## Basic usage +`python redfin_run.py` \ No newline at end of file diff --git a/csv_to_json.py b/csv_to_json.py new file mode 100644 index 0000000..63d58bf --- /dev/null +++ b/csv_to_json.py @@ -0,0 +1,43 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-31 下午3:31 +# @Author : RenMeng + +import pandas as pd +import json +import re + +data = pd.read_csv('data.csv') +data = data.fillna('') +nrows = data.shape[0] + +for j in range(nrows): + + webdata = data.iloc[j].to_dict() + tars = ['overview', 'property_taxes', 'new_homes', + 'price_history', 'similar_homes', 'new_listing', 'comparable_sales'] + + for key in webdata: + if key == 'local_commons': + webdata[key] = re.sub('[^0-9A-Za-z!\?\.,:\"\"\'\' \n]', '', webdata[key]) + if key in tars and webdata[key] != '': + try: + webdata[key] = eval(webdata[key]) + except: + v = webdata[key] + v = re.findall('\((datetime.*?[^0-9])\)', v) + new_v = [] + for _ele in v: + new_ele = re.sub('\)|datetime\.datetime\(', '', _ele).split(',') + new_v.append(['-'.join([i.strip() for i in new_ele[:3]]), new_ele[-2], eval(new_ele[-1])]) + webdata[key] = new_v + elif key != '': + ele = webdata[key].split('\n') + if len(ele) > 1: + webdata[key] = ele + + if key == 'comparable_sales': + webdata[key] = [{ele[0]: ele[1] for ele in line} for line in webdata[key]] + + + open('./result/trulia/trulia_output_{:d}.json'.format(j), 'w', encoding='utf-8').\ + write(json.dumps(webdata, indent=4, ensure_ascii=False)) \ No newline at end of file diff --git a/proxies.txt b/proxies.txt new file mode 100644 index 0000000..e69de29 diff --git a/redfin.py b/redfin.py new file mode 100644 index 0000000..186b735 --- /dev/null +++ b/redfin.py @@ -0,0 +1,532 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-28 上午10:41 +# @Author : RenMeng + +# coding=utf-8 + +# from selenium import webdriver +# url = 'https://www.google.com/recaptcha/api2/demo' +# browser = webdriver.Chrome() +# browser.get(url) +# browser.find_elements_by_tag_name("iframe")[0] + + +from time import sleep +import requests +from selenium import webdriver +from bs4 import BeautifulSoup +from collections import OrderedDict +import re +import json +from random import choice, randint +from selenium.webdriver import Firefox +from selenium.webdriver.firefox.webdriver import FirefoxProfile +from bs4 import NavigableString + +import urllib3 +urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) + +reg_property_history_row = re.compile('propertyHistory\-[0-9]+') +reg_offerinsight_row = re.compile('offerInsights\-[0-9]+') +reg_property_urls = re.compile('(/[A-Z][A-Z]/[A-Za-z\-/0-9]+/home/[0-9]+)') +user_agent_header = { + 'User-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.81 Safari/537.36'} + + +class RedFin(): + def __init__(self): + self.start_url = 'https://www.redfin.com/zipcode/94043' + # self.session = requests.Session() + self.use_selenium = True + # proxy option can be set after class object is loaded + # self.use_proxies = True + self.output_data = [] + self.property_urls = [] + PROXY = "http://127.0.0.1:8118" 
# IP:PORT or HOST:PORT + chrome_options = webdriver.ChromeOptions() + chrome_options.add_argument('--proxy-server=%s' % PROXY) + self.driver = webdriver.Chrome(options=chrome_options) + + # load proxies from file, one per line, in proxy:port format + # self.proxies = [l.rstrip() for l in open('proxies.txt').readlines()] + # make a separate session for each proxy + # self.sessions = {} + # for proxy in self.proxies: + # self.sessions[proxy] = { + # 'session': requests.Session(), + # 'proxy': {'http': 'http://' + proxy, + # 'https': 'https://' + proxy} + # } + # load data collected so far in order to avoid needing to scrape + # the same data twice + # try: + # self.output_data = json.loads(open('redfin_output.json').read()) + # except: + # self.output_data = [] + + def rand_sleep(self): + # you can set the random sleep time for no browser mode here + sleep(randint(5, 10)) + + def parse_finished_urls(self): + # function for removing urls that have already been completed + done_urls_list = set() + for property_data in self.output_data: + url = property_data['url'][22:] + done_urls_list.add(url) + if url in self.property_urls: self.property_urls.remove(url) + print(str(len(done_urls_list)) + ' properties already done') + print(str(len(self.property_urls)) + ' properties to go') + + def get_search_results(self): + page_source = self.request_search_page(self.start_url) + self.property_urls = reg_property_urls.findall(page_source.replace('\\u002F', '/')) + self.property_urls = list(set(self.property_urls)) + print('found ' + str(len(self.property_urls)) + ' results') + self.parse_finished_urls() + + def request_search_page(self, page_url): + if self.use_selenium: + self.use_selenium = False + return self.get_page_selenium(page_url) + else: + return self.make_page_request(page_url) + + def get_property_data(self): + count = 0 + for property_url in self.property_urls: + webdata = self.get_property_page(property_url) + count += 1 + open('./result/redfin_output_{:d}.json'.format(count), 'w', encoding='utf-8').\ + write(json.dumps(webdata, indent=4, ensure_ascii=False)) + print('finished page ' + str(count)) + self.output_data.append(webdata) + + def make_page_request(self, property_url): + self.rand_sleep() + if self.use_selenium: + return self.get_page_selenium('https://www.redfin.com' + property_url) + # elif self.use_proxies: + # return self.make_page_request_proxy(property_url) + else: + return self.make_page_request_no_proxy('https://www.redfin.com' + property_url) + + def make_page_request_no_proxy(self, property_url): + # use a loop to handle various http request errors and retry; + # if 10 failures are reached, assume we've been blocked + # for i in range(10): + # try: + # http_response = self.session.get(property_url, headers=user_agent_header, verify=False) + # if http_response.status_code == 200: break + # except Exception as e: + # print(1, 'Request error') + # if i == 9: print(1, 'blocked error');exit() + # return http_response.text + self.driver.get(property_url) + return self.driver.page_source + + def make_page_request_proxy(self, property_url): + # use a loop to handle various http request errors and retry; + # if 10 failures are reached, assume we've been blocked + for i in range(10): + try: + session = self.sessions[choice(self.proxies)] + http_response = session['session'].get(property_url, headers=user_agent_header, + proxies=session['proxy'], verify=False) + if http_response.status_code == 200: break + except Exception as e: + print(2, 'Request error') + if i == 9: print(2, 'blocked error');exit() + return 
http_response.text + + def get_property_page(self, property_url): + page_source = self.make_page_request(property_url) + return self.parse_property_page(page_source, property_url) + + def parse_property_page(self, page_source, property_url): + self.soup = BeautifulSoup(page_source, 'html.parser') + property_data = OrderedDict() + + # basic_info + property_data['basic_info'] = OrderedDict() + property_data['basic_info']['url'] = 'https://www.redfin.com' + property_url + # use try catch to handle when a data point is not available + try: + property_data['basic_info']['street_address'] = \ + self.soup.find('span', attrs={'class': 'street-address'}).get_text() + except: + print('street_address not found') + + try: + property_data['basic_info']['address_locality'] = \ + self.soup.find('span', attrs={'class': 'citystatezip'}).get_text() + except: + print('address_locality not found') + + try: + property_data['basic_info']['price'] = \ + self.soup.find('div', attrs={'class': 'info-block price'}).find('div').get_text() + except: + print('price not found') + + try: + property_data['basic_info']['beds'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-beds'}).find('div').get_text() + except: + print('beds not found') + + try: + property_data['basic_info']['baths'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-baths'}).find('div').get_text() + except: + print('baths not found') + + try: + property_data['basic_info']['sqFt'] = \ + ' '.join([item.get_text() for item in + (self.soup.select('span.statsValue') + self.soup.select('span.sqft-label'))]) + except: + print('sqFt not found') + + try: + property_data['basic_info']['price_per_sqFt'] = \ + self.soup.find('div', attrs={'data-rf-test-id': 'abp-sqFt'}).\ + find('div', attrs={"data-rf-test-id": "abp-priceperft"}).get_text() + except: + print('price_per_sqFt not found') + + try: + property_data['basic_info']['redfin estimate'] =\ + self.soup(text=re.compile('Redfin Estimate:'))[0].parent.parent.parent.\ + next_sibling.get_text() + except: + print('redfin estimate not found') + + try: + property_data['basic_info']['days on Redfin'] = \ + self.soup(text=re.compile('On Redfin'))[0].parent.next_sibling.get_text() + except: + print('days on Redfin not found') + + try: + property_data['basic_info']['year_built'] = \ + self.soup.find('span', attrs={"data-rf-test-id": "abp-yearBuilt"}).\ + find('span', attrs={'class': 'value'}).get_text() + except: + print('year_built not found') + + try: + property_data['basic_info']['status'] = \ + self.soup.find('span', attrs={"data-rf-test-id": "abp-status"}).\ + find('span', attrs={'class': 'value'}).get_text() + except: + print('status not found') + + # overview + overview = {} + try: + overview['describe'] = self.soup.select('div.house-info')[0].\ + select('div[class*="remarks"]')[0].get_text() + except: + overview['describe'] = 'not found' + details = OrderedDict() + try: + for child in self.soup.find('div', attrs={'class': 'keyDetailsList'}).children: + cells = list(child.children) + details[cells[0].get_text().strip()] = cells[1].get_text().strip() + except: + pass + overview['detail'] = details + property_data['overview'] = overview + + # use loops to maintain data structure ina dict + property_data['property_details'] = OrderedDict() + try: + for category in self.soup.find('div', attrs={'class': 'amenities-container'}).children: + if category.get('class')[0] == 'super-group-title': + key = category.contents[0] + elif category.get('class')[0] == 'super-group-content': + 
property_data['property_details'][key] = OrderedDict() + for row in category.find_all('div', attrs={'class': 'amenity-group'}): + key2 = row.find('h3').get_text() + property_data['property_details'][key][key2] = [] + for row2 in row.find_all('li'): + property_data['property_details'][key][key2].append(row2.get_text()) + except: + pass + + property_data['property_history'] = [] + try: + for row in self.soup.find_all('tr', attrs={'id': reg_property_history_row}): + data_cells = row.find_all('td') + history_data_row = OrderedDict() + history_data_row['date'] = data_cells[0].get_text() + history_data_row['event & source'] = data_cells[1].get_text() + history_data_row['price'] = data_cells[2].get_text() + history_data_row['appreciation'] = data_cells[3].get_text() + property_data['property_history'].append(history_data_row) + except: + pass + + + property_data['school'] = OrderedDict() + try: + school_tabs = [item.get_text() for item in self.soup.find('div', attrs={'class':'scrollable tabs'})] + for tab in school_tabs: + self.driver.find_element_by_xpath('//button[text()="{}"]'.format(tab)).click() + self.soup = BeautifulSoup(self.driver.page_source, 'html.parser') + school_table = self.soup.select('div.schools-content')[0].select('tr.schools-table-row') + thead = [item.get_text() for item in school_table[0].find_all('th')] + tbody = [[item for item in trow.find_all('td')] for trow in school_table[1:]] + col_num = len(thead) + tbody = [[row[i].find('div', attrs={'data-tf-test-name', 'school-name'}).get_text() + if i == 0 else row[i].get_text() for i in range(col_num)] for row in tbody] + school_item = [{thead[i]: row[i] for i in range(col_num)} for row in tbody] + property_data['school'][tab] = school_item + except: + pass + + property_data['insights'] = [] + try: + for item in self.soup.find('div', + attrs={'data-rf-test-id': 'tourInsights'}).select('div.currentTourInsights')[0].children: + common = OrderedDict() + common['note'] = item.select('div.note')[0].get_text() + common['agent-info'] = item.select('div.agent-info')[0].find('div').get_text() + common['date'] = item.select('div.agent-info')[0].find('span', attrs={'class': 'date'}).get_text() + property_data['insights'].append(common) + except: + pass + + property_data['activity'] = [] + try: + for item in self.soup.find('div', + attrs={'data-rf-test-id': 'activitySection'}).find_all('td'): + property_data['activity'].append(' '.join([child.get_text() for child in item.select('div.labels')[0].contents])) + except: + pass + + property_data['public-facts'] = OrderedDict() + try: + for child in self.soup.select('div.public-records-taxes')[0].children: + key = child.find('h3').get_text() + property_data['public-facts'][key] = OrderedDict() + for tr in child.find_all('tr'): + cells = list(tr.children) + property_data['public-facts'][key][cells[0].get_text()] = cells[1].get_text() + property_data['public-facts']['home-facts'] = OrderedDict() + for child in self.soup.select('div.facts-table')[0].select('div.table-row'): + cells = list(child.contents) + property_data['public-facts']['home-facts'][cells[0].get_text()] = cells[1].get_text() + except: + pass + + try: + for child in 
self.soup.select('div.facts-table')[0].select('div.table-row'): + cells = list(child.contents) + property_data['public-facts']['home-facts'][cells[0].get_text()] = cells[1].get_text() + except: + pass + + try: + key = self.soup.select('#redfin-estimate')[0].find('h2').get_text() + property_data[key] = OrderedDict() + property_data[key]['EstimateValue'] = self.soup.select('#redfin-estimate')[0].select( + 'div[class*="RedfinEstimateValueHeader"]')[0].get_text() + property_data[key]['PriceDiff'] = self.soup.select('#redfin-estimate')[0].select( + 'div[class*="listPriceDiff"]')[0].get_text() + property_data[key]['comps'] = OrderedDict() + property_data[key]['comps']['based_on'] = self.soup.select('#redfin-estimate')[0].select( + 'div.comps')[0].contents[0].get_text() + property_data[key]['comps']['homecard'] = [] + for node in self.soup.select('#redfin-estimate')[0].select('div.comps')[0].select( + 'div.homecard'): + card = {} + card['url'] = 'https://www.redfin.com' + node.find('a')['href'] + card['sold_date'] = [item.get_text() for item in node.select('div.topleft')[0]] + card['details'] = [item.get_text() for item in node.select('div.left')[0]] + \ + [item.get_text() for item in node.select('div.right')[0].contents[0].children] + property_data[key]['comps']['homecard'].append(card) + except: + print('redfin-estimate not found') + + + # try: + # key = self.soup.select('#redfin-estimate')[0].find('h2').get_text() + # property_data[key] = OrderedDict() + # except: + # pass + + try: + key = [] + for child in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).find('h2').children: + if isinstance(child, NavigableString): + key.append(child) + else: + key += [item.get_text().strip() for item in child.children if item.name != 'script'] + key = ' '.join(key) + property_data[key] = OrderedDict() + key2 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).select( + 'h3[class*="walkscore-header"]')[0].get_text().strip() + property_data[key][key2] = [] + for child in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).select( + 'div.walk-score')[0].select('div.scrollable')[0].contents[0].children: + property_data[key][key2].append(' '.join([i.get_text() for i in child.children])) + desc = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.desc.blurb')[0].get_text() + property_data[key][key2].append(desc) + try: + key3 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).select( + 'div.OfferInsights')[0].find('h3').get_text() + property_data[key][key3] = OrderedDict() + for tr in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).select( + 'div.OfferInsights')[0].find('table', attrs={'class': 'basic-table'}).find_all('tr'): + for td in tr.find_all('td'): + cell = list(td.children) + property_data[key][key3][cell[0].get_text().strip()] = cell[1].get_text().strip() + except: + pass + + # 4th + try: + key4 = self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).\ + select('div.title.primary-heading.h3')[0].get_text() + property_data[key][key4] = [] + for row in self.soup.find('div', attrs={'data-rf-test-id': 'neighborhoodSection'}).\ + find_all('li', attrs={'id': reg_offerinsight_row}): + line = OrderedDict() + target_value = ['offer-value', 'sale-date', 'home-stats', + 'offer-result-line', 'offer-insight', ] + for _v in target_value: + try: + line[_v] = row.select('div.{}'.format(_v))[0].get_text() + except: + pass + try: + line['agent-info'] = 
row.select('div.agent-info')[0].select('span.agent-detail-name')[0].get_text() + except: + pass + property_data[key][key4].append(line) + except: + pass + + # 5th + try: + key5 = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.statsAndChartsContainer')[0].find('h3').get_text() + property_data[key][key5] = [] + table = self.soup.find('div', attrs={'data-rf-test-id':'neighborhoodSection'}).\ + select('div.statsAndChartsContainer')[0].find('table', attrs={'class': 'basic-table'}) + header = [th.get_text() for th in table.find('thead').find_all('th')] + header_num = len(header) + for tr in table.find('tbody').find_all('tr'): + line = OrderedDict() + value = [td if isinstance(td, NavigableString) else td.get_text() + for td in tr.find_all('td')] + for i in range(header_num): + line[header[i]] = value[i] + property_data[key][key5].append(line) + + except: + pass + + except: + print('neighborhood info not found') + + try: + key = 'Nearby Similar Homes' + property_data[key] = OrderedDict() + try: + children = list(self.soup(text=re.compile(key))[1].parent.next_sibling.children) + except: + children = list(self.soup(text=re.compile(key))[0].parent.next_sibling.children) + + property_data[key]['desc'] = children[0].get_text() + property_data[key]['home_list'] = [] + for child in children[1].find_all('div', attrs={'class': 'SimilarHomeCardReact'}): + home_card = {} + home_card['url'] = 'https://www.redfin.com' + child.find('a')['href'] + details = [] + try: + details.append(child.select('div.topleft')[0].get_text()) + except: + pass + for item in child.select('div.bottomV2')[0].children: + if item.name == 'script': + continue + details += [i if isinstance(i, NavigableString) else i.get_text() for i in item.children] + print(details) + home_card['details'] = ' '.join(details) + property_data[key]['home_list'].append(home_card) + except: + print('similar list not found') + + try: + key = 'Nearby Recently Sold Homes' + property_data[key] = OrderedDict() + children = list(self.soup(text=re.compile(key))[0].parent.next_sibling.children) + property_data[key]['desc'] = children[0].get_text() + property_data[key]['home_list'] = [] + for child in children[1].find_all('div', attrs={'class': 'SimilarHomeCardReact'}): + home_card = {} + home_card['url'] = 'https://www.redfin.com' + child.find('a')['href'] + details = [] + try: + details.append(child.select('div.topleft')[0].get_text()) + except: + pass + for item in child.select('div.bottomV2')[0].children: + if item.name == 'script': + continue + details += [i if isinstance(i, NavigableString) else i.get_text() for i in item.children] + home_card['details'] = ' '.join(details) + property_data[key]['home_list'].append(home_card) + except: + print('recent sold not found') + print(property_data) + return property_data + + def use_browser(self): + self.use_selenium = True + firefox_profile = FirefoxProfile() + # might as well turn off images since we don't need them + if self.use_proxies: + # if use proxies is true load firefox with proxies + firefox_profile.set_preference("permissions.default.image", 2) + proxy_host, proxy_port = choice(self.proxies).split(':') + firefox_profile.set_preference("network.proxy.type", 1) + firefox_profile.set_preference("network.proxy.http", proxy_host) + firefox_profile.set_preference("network.proxy.http_port", int(proxy_port)) + firefox_profile.set_preference("network.proxy.ssl", proxy_host) + firefox_profile.set_preference("network.proxy.ssl_port", int(proxy_port)) + self.driver = 
Firefox(firefox_profile) + self.driver.implicitly_wait(2) + + def get_page_selenium(self, page_url): + self.driver.get(page_url) + self.selenium_bypass_captcha() + return self.driver.page_source + + def selenium_bypass_captcha(self): + # basic code for handling captcha + # this requires the user to actually solve the captcha and then continue + # try: + print('do check.....') + self.driver.switch_to.frame(self.driver.find_elements_by_tag_name("iframe")[0]) + self.driver.find_element_by_class_name('recaptcha-checkbox-border').click() + print('solve captcha ( pop up only ) and press enter to continue') + input() + self.driver.switch_to.default_content() + self.driver.find_element_by_id('submit').click() + # except Exception as e: + # pass + + + diff --git a/redfin_run.py b/redfin_run.py new file mode 100644 index 0000000..03ad725 --- /dev/null +++ b/redfin_run.py @@ -0,0 +1,9 @@ +# -*- coding: utf-8 -*- +# @Time : 19-10-28 上午10:47 +# @Author : RenMeng + +from redfin import RedFin + +redfin = RedFin() +redfin.get_search_results() +redfin.get_property_data() \ No newline at end of file diff --git a/trulia_scraper/spiders/trulia.py b/trulia_scraper/spiders/trulia.py index 974f5a6..d2b6a2d 100644 --- a/trulia_scraper/spiders/trulia.py +++ b/trulia_scraper/spiders/trulia.py @@ -24,7 +24,7 @@ class TruliaSpider(scrapy.Spider): name = 'trulia' allowed_domains = ['trulia.com'] custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'), - 'FEED_FORMAT': 'jsonlines', + 'FEED_FORMAT': 'json', 'FEED_EXPORT_FIELDS': cols} def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): @@ -32,7 +32,7 @@ def __init__(self, state='CA', city='Mountain_View', *args, **kwargs): self.state = state self.city = city # self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] - self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)] + self.start_urls = ['http://trulia.com/{state}/{city}/94043'.format(state=state, city=city)] # self.le = LinkExtractor(allow=[r'^https://www.trulia.com/p/ca', r'^https://www.trulia.com/property', r'^https://www.trulia.com/builder-community']) self.link_path = '//div[@data-testid="search-result-list-container"]//a/@href'
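Usage note: a minimal sketch of how the get_rel_url() helper added in utils.py is expected to resolve hrefs extracted from the search-result list (assuming the project root is on the import path); the listing path and property id below are made up for illustration and are not taken from this patch.

    # Sketch only: exercises utils.get_rel_url() as defined in this patch; example URLs are hypothetical.
    from utils import get_rel_url

    # Relative hrefs are joined against the current page URL and path-normalized.
    print(get_rel_url('https://www.trulia.com/CA/Mountain_View/2_p/',
                      '/p/ca/mountain-view/1000-example-st-94043'))
    # https://www.trulia.com/p/ca/mountain-view/1000-example-st-94043

    # Absolute URLs (not starting with "/" or "./") are returned unchanged.
    print(get_rel_url('https://www.trulia.com/CA/Mountain_View/',
                      'https://www.trulia.com/property/1234567890'))
    # https://www.trulia.com/property/1234567890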