# Basic Zillow Scraper
This is a basic Zillow scraper built in Python with the Requests library for HTTP, plus BeautifulSoup and lxml for parsing; no browser automation is involved. Hopefully you find it helpful.

To scrape the complete data, I first scrape the Zillow search page with all the listings, then scrape each individual listing to gather additional information. The tricky part for me is that not all of the information lives in the same part of the embedded JSON.
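Concretely, both page types embed their data in a `<script id="__NEXT_DATA__">` tag, but under different paths. Here is a minimal sketch of the two locations the full script below walks (error handling omitted):

```python
import json
from bs4 import BeautifulSoup

def locate_payloads(page_html):
    """Show where each page type hides its JSON (paths mirror the script below)."""
    soup = BeautifulSoup(page_html, "html.parser")
    next_data = json.loads(soup.find("script", {"id": "__NEXT_DATA__"}).string)

    # Search-results page: a list of listing summaries
    listings = (next_data.get("props", {}).get("pageProps", {})
                .get("searchPageState", {}).get("cat1", {})
                .get("searchResults", {}).get("listResults", []))

    # Detail page: the full record sits in gdpClientCache, which is itself
    # a JSON string that has to be decoded a second time
    cache = (next_data.get("props", {}).get("pageProps", {})
             .get("componentProps", {}).get("gdpClientCache"))
    details = json.loads(cache) if isinstance(cache, str) else cache

    return listings, details
```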
The request helpers below rotate through sticky proxies and retry on 403:

```python
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 11 13:28:07 2024

@author: alice
"""
import requests
from bs4 import BeautifulSoup
import json
import time
from lxml import html
import random
import pandas as pd  # used by saveToExcel; was missing from the original imports


def getProxies():
    """Pick a random sticky proxy from the rotating ProxyRack pool."""
    base_proxy = "usa.rotating.proxyrack.net"
    # Generate a list of sticky proxies for ports 9001 to 9050
    stickyProxies = [f"{base_proxy}:{port}" for port in range(9001, 9051)]
    # Randomly select one so each request can come from a different exit IP
    proxy = random.choice(stickyProxies)
    return {
        'http': proxy,
        'https': proxy,
    }


def detailDataGrab(url):
    """Fetch a property detail page, retrying with a fresh proxy on 403."""
    for i in range(10):
        proxies = getProxies()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br, zstd",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": url,
            "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        status = response.status_code
        print(status)
        if status != 403:
            return response
        # 403: blocked, try again with a different proxy
    return None  # every attempt was blocked


def getDetails(url):
    """Fetch one detail page and return its parsed property fields."""
    response = detailDataGrab(url)
    if response is None:
        return {"error": "request failed"}
    return extract_property_details(response.text)


def grabdata(url, zipcode):
    """Fetch a search-results page, retrying with a fresh proxy on 403."""
    for i in range(10):
        time.sleep(0.1)
        proxies = getProxies()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br, zstd",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": "https://www.zillow.com/homes/for_sale/{}_rb/".format(zipcode),
            "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
        }
        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        status = response.status_code
        print(status)
        if status != 403:
            return response
        # 403: blocked, try again with a different proxy
    return None  # every attempt was blocked
```
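Both helpers give up quietly after ten 403s and return `None`. If the proxy pool is having a bad day, an exponential backoff between attempts is gentler on it. This is a sketch of the idea, not part of the original script; it reuses `getProxies` from above and takes the headers dict as an argument:

```python
def fetch_with_retries(url, headers, max_attempts=10, base_delay=1.0):
    """Hypothetical retry helper: rotates proxies and backs off after blocks."""
    for attempt in range(max_attempts):
        try:
            resp = requests.get(url, headers=headers,
                                proxies=getProxies(), timeout=30)
            if resp.status_code != 403:
                return resp
        except requests.RequestException as exc:
            print(f"attempt {attempt + 1} failed: {exc}")
        # Exponential backoff (capped) with jitter so retries don't stampede
        time.sleep(min(60, base_delay * 2 ** attempt) + random.random())
    raise RuntimeError(f"gave up on {url} after {max_attempts} attempts")
```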
The parsing side: `parse` pulls the listing summaries out of a search page, `nextpage` builds the URL of the following results page, `getFirst` flattens a single summary, and `saveToExcel` appends rows to an existing CSV. (The original script defined `saveToExcel` and `nextpage` twice; only the later definitions ever ran, so the merged versions are kept here.)

```python
def parse(response):
    """Pull the embedded JSON and the listing summaries out of a search page."""
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find the script tag with id "__NEXT_DATA__"
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})
    if script_tag:
        # Load the text content of the script tag as JSON
        json_data = json.loads(script_tag.string)
    else:
        print("Script tag with id '__NEXT_DATA__' not found.")
        return None, []
    try:
        listings = json_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
        print(f"Found {len(listings)} listings on the page.")
    except KeyError as e:
        print(f"Error parsing listings: {e}")
        return None, []
    return json_data, listings


def nextpage(json_data):
    """Return the absolute URL of the next search-results page."""
    next_path = json_data['props']['pageProps']['searchPageState']['cat1']['searchList']['pagination']['nextUrl']
    return 'https://www.zillow.com' + str(next_path)


def getFirst(listing):
    """Flatten the summary fields available directly on a search result."""
    property_data = {
        'statusType': listing.get('statusType', None),
        'street': listing.get('addressStreet', None),
        'city': listing.get('addressCity', None),
        'state': listing.get('addressState', None),
        'zip': listing.get('addressZipcode', None),
        'bathroomCount': listing.get('baths', None),
        'bedroomCount': listing.get('beds', None),
        'livingArea': listing.get('area', None),
        'latitude': listing.get('latLong', {}).get('latitude', None),
        'longitude': listing.get('latLong', {}).get('longitude', None),
        'price': listing.get('price', None),
        'status': listing.get('statusType', None),
        'listingKey': listing.get('id', None),
        'imageUrl': listing.get('imgSrc', None),
        'brokerageName': listing.get('brokerName', None),
        'marketingStatusSimplified': listing.get('marketingStatusSimplifiedCd', None),
        'rawHomeStatus': listing.get('rawHomeStatusCd', None),
        'detailUrl': listing.get('detailUrl', None),
        'countryCurrency': listing.get('countryCurrency', None),
        'statusText': listing.get('statusText', None),
        'isUndisclosedAddress': listing.get('isUndisclosedAddress', None),
        'isZillowOwned': listing.get('isZillowOwned', None)
    }
    return property_data


def saveToExcel(df):
    """Append the scraped rows to the existing CSV, matching its column order."""
    print('try saving')
    try:
        file_path = r"C:\Users\alice\OneDrive\Desktop\zillow\zillow_data.csv"
        existing_df = pd.read_csv(file_path)
        # Reorder df to match the existing CSV headers
        df = df.reindex(columns=existing_df.columns)
        # Append without headers since the file already has them
        df.to_csv(file_path, mode='a', index=False, header=False)
        print(f"Data saved successfully to {file_path}")
    except Exception as e:
        print(e)
```
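To sanity-check the search-page parsing on its own, something like this works once you have a response from `grabdata` (the zipcode here is just the example used later in the script):

```python
response = grabdata("https://www.zillow.com/78258/", "78258")
json_data, listings = parse(response)
if listings:
    sample = getFirst(listings[0])
    print(sample['street'], sample['price'], sample['detailUrl'])
```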
The detail-page extractor is the longest piece. It decodes `gdpClientCache` (a JSON string nested inside the `__NEXT_DATA__` JSON), takes the first cache entry, and reads fields from several different subtrees: `property`, `property.resoFacts`, `property.attributionInfo`, and the `atAGlanceFacts` list.

```python
def extract_property_details(html_content):
    """Dig the full property record out of a detail page's gdpClientCache."""
    tree = html.fromstring(html_content)
    # The detail page also embeds its data in a __NEXT_DATA__ script tag
    script_content = tree.xpath('//script[@id="__NEXT_DATA__"]/text()')
    if not script_content:
        return {"error": "No property data found"}
    try:
        data = json.loads(script_content[0])
        # Navigate to the nested cache, which is usually a JSON string
        # that has to be decoded a second time
        gdp_client_cache = data.get("props", {}).get("pageProps", {}).get("componentProps", {}).get("gdpClientCache", {})
        if isinstance(gdp_client_cache, str):
            gdp_client_cache = json.loads(gdp_client_cache)
        data = gdp_client_cache
        extracted_details = {}
        # The cache is keyed by an opaque query string; take the first entry
        key = next(iter(data))
        data = data[key]

        # Property details
        extracted_details['county'] = data['property'].get('county')
        extracted_details['countyFIPS'] = data['property'].get('countyFIPS')
        extracted_details['zipPlus4'] = data['property'].get('zipPlus4', '')

        # MLS details
        extracted_details['appliances'] = data['property']['resoFacts'].get('appliances', [])
        extracted_details['architecturalStyle'] = data['property']['resoFacts'].get('architecturalStyle', '')
        extracted_details['buildingStyle'] = data['property'].get('homeType', '')
        extracted_details['condoFloorNumber'] = data['property'].get('condoFloorNumber', 0)
        extracted_details['coolingTypes'] = data['property']['resoFacts'].get('cooling', [])
        extracted_details['description'] = data['property'].get('description', '')
        extracted_details['directions'] = data['property'].get('directions', '')
        extracted_details['exteriorConstruction'] = data['property']['resoFacts'].get('exteriorFeatures', [])
        extracted_details['failedListingDate'] = data['property'].get('failedListingDate', '')
        extracted_details['floorCount'] = data['property']['resoFacts'].get('stories', 0)
        extracted_details['fullBathroomCount'] = data['property']['resoFacts'].get('bathroomsFull', 0)
        extracted_details['halfBathroomCount'] = data['property']['resoFacts'].get('bathroomsHalf', 0)

        # interiorFeatures can be missing or malformed, so guard it
        try:
            interior_features = data.get('property', {}).get('resoFacts', {}).get('interiorFeatures', [])
            if isinstance(interior_features, list):
                extracted_details['hasCeilingFan'] = any('ceiling fan' in feature.lower() for feature in interior_features if isinstance(feature, str))
                extracted_details['hasVaultedCeiling'] = any('vaulted ceiling' in feature.lower() for feature in interior_features if isinstance(feature, str))
            else:
                extracted_details['hasCeilingFan'] = False
                extracted_details['hasVaultedCeiling'] = False
        except Exception:
            extracted_details['hasCeilingFan'] = False
            extracted_details['hasVaultedCeiling'] = False

        extracted_details['heating'] = data['property']['resoFacts'].get('heating', [])
        extracted_details['initialListingStatus'] = data['property'].get('initialListingStatus', '')
        extracted_details['listingCategory'] = data['property'].get('listingCategory', '')
        extracted_details['lotSizeSquareFeet'] = data['property']['resoFacts'].get('lotSize', '')
        extracted_details['maxListPriceDate'] = data['property'].get('maxListPriceDate', '')
        extracted_details['minListPrice'] = data['property'].get('minListPrice', 0)
        extracted_details['minListPriceDate'] = data['property'].get('minListPriceDate', '')
        extracted_details['mlsId'] = data['property']['attributionInfo'].get('mlsId', '')
        extracted_details['mlsName'] = data['property']['attributionInfo'].get('mlsName', '')
        extracted_details['mlsNumber'] = data['property']['resoFacts'].get('listingId', '')
        extracted_details['neighborhood'] = data['property'].get('neighborhoodRegion', '')
        extracted_details['newConstruction'] = data['property'].get('newConstructionType', False)
        extracted_details['oneQuarterBathroomCount'] = data['property']['resoFacts'].get('bathroomsOneQuarter', 0)
        extracted_details['originalListingDate'] = data['property']['resoFacts'].get('onMarketDate', '')
        extracted_details['parkingSpaceCount'] = data['property']['resoFacts'].get('parkingCapacity', 0)
        extracted_details['partialBathroomCount'] = data['property']['resoFacts'].get('bathroomsPartial', 0)
        extracted_details['patioAndPorchFeatures'] = data['property']['resoFacts'].get('patioAndPorchFeatures', [])
        extracted_details['price'] = data['property'].get('zestimate', 0)
        extracted_details['daysOnMarket'] = data['property']['resoFacts'].get('cumulativeDaysOnMarket', 0)
        extracted_details['propertySubtype'] = data['property']['resoFacts'].get('propertySubType', '')
        extracted_details['propertyType'] = data['property'].get('homeType', '')
        extracted_details['rentalIndicator'] = data['property'].get('postingIsRental', False)
        extracted_details['rental'] = data['property'].get('postingIsRental', False)  # duplicate of rentalIndicator; kept for clarity
        extracted_details['roofType'] = data['property']['resoFacts'].get('roofType', '')
        extracted_details['soldDate'] = data['property'].get('dateSold', '')
        extracted_details['soldPrice'] = data['property'].get('lastSoldPrice', 0)
        extracted_details['statusSubtype'] = data['property'].get('statusSubtype', '')
        extracted_details['subDivision'] = data['property'].get('subdivisionName', '')
        extracted_details['title'] = data['property'].get('title', '')
        extracted_details['totalBuildingAreaSquareFeet'] = data['property']['resoFacts'].get('buildingArea', 0)
        extracted_details['yearBuilt'] = data['property'].get('yearBuilt', 0)
        extracted_details['salePriceIsEstimated'] = 'zestimate' in data['property'].get('adTargets', '')
        extracted_details['realtorId'] = data['property'].get('realtorId', '')
        extracted_details['copyright'] = data['property'].get('copyright', '')

        # Brokerage details
        # extracted_details['mls_brokerage_name'] = data['property']['attributionInfo'].get('brokerName', '')
        extracted_details['mls_brokerage_phoneNumber'] = data['property']['attributionInfo'].get('brokerPhoneNumber', '')
        extracted_details['mls_brokerage_address'] = data['property']['resoFacts'].get('associationName', '')
        extracted_details['mls_brokerage_email'] = data['property']['attributionInfo'].get('brokerEmail', '')
        extracted_details['mls_brokerage_websiteUrl'] = data['property']['attributionInfo'].get('brokerWebsiteUrl', '')

        property_data = data.get('property', {})
        reso_facts = property_data.get('resoFacts', {})
        attribution_info = property_data.get('attributionInfo', {})

        # Listing agent details (first agent only)
        listing_agents = attribution_info.get('listingAgents', [])
        if isinstance(listing_agents, list):
            if len(listing_agents) > 0:
                extracted_details['mls_listingAgents_name_1'] = listing_agents[0].get('memberFullName', '')
                extracted_details['mls_listingAgents_role_1'] = listing_agents[0].get('associatedAgentType', '')
                extracted_details['mls_listingAgents_primaryPhoneNumber_1'] = listing_agents[0].get('agentPhoneNumber', '')
                extracted_details['mls_listingAgents_email_1'] = listing_agents[0].get('agentEmail', '')
            else:
                extracted_details['mls_listingAgents_name_1'] = None
                extracted_details['mls_listingAgents_role_1'] = None
                extracted_details['mls_listingAgents_primaryPhoneNumber_1'] = None
                extracted_details['mls_listingAgents_email_1'] = None

        # School details (up to three schools)
        schools = property_data.get('schools', [])
        if isinstance(schools, list):
            if len(schools) > 0:
                extracted_details['mls_schools_name_1'] = schools[0].get('name', '')
                extracted_details['mls_schools_category_1'] = schools[0].get('category', '')
                extracted_details['mls_schools_district_1'] = schools[0].get('district', '')
            if len(schools) > 1:
                extracted_details['mls_schools_name_2'] = schools[1].get('name', '')
                extracted_details['mls_schools_category_2'] = schools[1].get('category', '')
                extracted_details['mls_schools_district_2'] = schools[1].get('district', '')
            if len(schools) > 2:
                extracted_details['mls_schools_name_3'] = schools[2].get('name', '')
                extracted_details['mls_schools_category_3'] = schools[2].get('category', '')
                extracted_details['mls_schools_district_3'] = schools[2].get('district', '')

        # Tax info (up to two entries)
        tax_info = property_data.get('taxInfo', [])
        if isinstance(tax_info, list):
            if len(tax_info) > 0:
                extracted_details['mls_taxes_year_1'] = tax_info[0].get('year', '')
                extracted_details['mls_taxes_amount_1'] = tax_info[0].get('amount', 0)
                extracted_details['mls_taxes_description_1'] = tax_info[0].get('description', '')
            if len(tax_info) > 1:
                extracted_details['mls_taxes_year_2'] = tax_info[1].get('year', '')
                extracted_details['mls_taxes_amount_2'] = tax_info[1].get('amount', 0)
                extracted_details['mls_taxes_description_2'] = tax_info[1].get('description', '')

        extracted_details['monthlyHoaFee'] = data['property']['resoFacts'].get('hoaFee', '')
        extracted_details['propertyTaxRate'] = data['property'].get('taxRate', 0)
        extracted_details['parcelId'] = data['property'].get('parcelId', '')
        extracted_details['parcel_number'] = data['property'].get('parcel_number', '')

        # Image and media details
        # extracted_details['image_url'] = data['property'].get('imageUrl', '')
        extracted_details['image_url_uncropped'] = data['property'].get('imageUrlUncropped', '')

        # Tour and contact details
        extracted_details['mls_tourRequestTitle'] = data['property'].get('tourRequestTitle', '')
        extracted_details['mls_tourNextAvailableTime'] = data['property'].get('tourNextAvailableTime', '')
        extracted_details['mls_contactFormTitle'] = data['property'].get('contactFormTitle', '')

        # atAGlanceFacts overrides several of the values pulled from
        # resoFacts above (type, year built, heating, parking, HOA, lot,
        # days on market); these reflect what Zillow displays on the page
        at_a_glance_facts = reso_facts.get('atAGlanceFacts', [])
        if isinstance(at_a_glance_facts, list):
            extracted_details['propertyType'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Type'), None)
            extracted_details['yearBuilt'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Year Built'), None)
            extracted_details['heating'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Heating'), None)
            extracted_details['cooling'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Cooling'), None)
            parking_fact = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Parking'), None)
            extracted_details['parkingSpaceCount'] = parking_fact.split()[0] if parking_fact else None
            extracted_details['monthlyHoaFee'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'HOA'), None)
            extracted_details['lotSizeSquareFeet'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Lot'), None)
            days_on_market_fact = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Days on Zillow'), None)
            extracted_details['daysOnMarket'] = days_on_market_fact.split()[0] if days_on_market_fact else None
            extracted_details['pricePerSquareFoot'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Price/sqft'), None)

        return extracted_details
    except Exception as e:
        print("error2: " + str(e))
        return {"error2": str(e)}
```
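The agent, school, and tax blocks above all hand-roll the same "first N list items into numbered columns" pattern. If you want to trim that repetition, a small generic helper along these lines would do it (illustrative, not part of the original script):

```python
def flatten_first_n(items, prefix, fields, n):
    """Spread the first n dicts in items into numbered flat columns."""
    out = {}
    for i, item in enumerate(items[:n], start=1):
        for field in fields:
            out[f"{prefix}_{field}_{i}"] = item.get(field, "")
    return out

# Equivalent to the hand-rolled school block above:
# extracted_details.update(
#     flatten_first_n(schools, "mls_schools", ["name", "category", "district"], 3))
```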
nextpage ) return next_url baseurl = "https://www.zillow.com/{}/" zipcode = '78258' url = baseurl.format(zipcode) print(url) response = grabdata(url, zipcode) json_data, listings = parse(response) next_url = nextpage(json_data) results = [] for listing in listings: first_data = getFirst(listing) url = listing['detailUrl'] details = getDetails(url) combined_data = {**first_data, **details} results.append(combined_data)