Basic Zillow Scraper

This is a basic Zillow scraper built with Python using the Requests library, with BeautifulSoup and lxml for parsing (and pandas for saving results). Hopefully, you find it somewhat helpful.

To scrape the complete data, I first scrape the Zillow search page that lists all the results for a ZIP code. Then I scrape each individual listing to gather additional information. The tricky part for me is that not all the information is located in the same part of the embedded JSON.
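Concretely, both the search page and the detail page embed their data in a script tag with id "__NEXT_DATA__", but the useful payload sits at different paths. Here is a rough sketch of the two lookups, where next_data stands for the parsed "__NEXT_DATA__" JSON; the key names are the ones the script below relies on, and Zillow can change them at any time:

# Search results page: the listings sit under searchPageState
listings = next_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']

# Detail page: property facts sit inside gdpClientCache, which may itself be a JSON string
cache = next_data['props']['pageProps']['componentProps']['gdpClientCache']
if isinstance(cache, str):
    cache = json.loads(cache)
property_data = cache[next(iter(cache))]['property']

The full script: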

# -*- coding: utf-8 -*-
"""
Created on Mon Nov 11 13:28:07 2024

@author: alice
"""

import requests
from bs4 import BeautifulSoup
import json
import time
import pandas as pd  # needed by saveToExcel below
from lxml import html
import random
import os
import argparse


######################################################################################

def getProxies():
    # Build a pool of sticky proxy endpoints on ports 9001-9050
    base_proxy = "usa.rotating.proxyrack.net"
    stickyProxies = [f"{base_proxy}:{port}" for port in range(9001, 9051)]

    # Randomly select a proxy from the list
    proxy = random.choice(stickyProxies)

    proxies = {
        'http': proxy,
        'https': proxy,
    }
    return proxies
# Fetch a listing detail page, retrying with a fresh proxy on a 403
def detailDataGrab(url):
    for i in range(10):
        proxies = getProxies()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br, zstd",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": url,
            "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
        }

        # Make the GET request
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        except requests.RequestException as e:
            print(f"Request failed ({e}); retrying with a new proxy")
            continue
        status = response.status_code
        print(status)
        if status != 403:
            return response
        # 403 means we were blocked; loop again with a different proxy
    return None  # every attempt was blocked

    
def getDetails(url):
    # e.g. url = 'https://www.zillow.com/homedetails/1415-Nicholas-Mnr-San-Antonio-TX-78258/83984537_zpid/'
    response = detailDataGrab(url)
    if response is None:
        return {"error": "all fetch attempts were blocked"}
    details = extract_property_details(response.text)
    return details

######################################################################################


# Fetch a search results page, retrying with a fresh proxy on a 403
def grabdata(url, zipcode):
    for i in range(10):
        time.sleep(0.1)
        proxies = getProxies()
        headers = {
            "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
            "accept-encoding": "gzip, deflate, br, zstd",
            "accept-language": "en-US,en;q=0.9",
            "cache-control": "max-age=0",
            "priority": "u=0, i",
            "referer": "https://www.zillow.com/homes/for_sale/{}_rb/".format(zipcode),
            "sec-ch-ua": '"Chromium";v="130", "Google Chrome";v="130", "Not?A_Brand";v="99"',
            "sec-ch-ua-mobile": "?0",
            "sec-ch-ua-platform": '"Windows"',
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "?1",
            "upgrade-insecure-requests": "1",
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
        }

        # Make the GET request
        try:
            response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
        except requests.RequestException as e:
            print(f"Request failed ({e}); retrying with a new proxy")
            continue
        status = response.status_code
        print(status)
        if status != 403:
            return response
        # 403 means we were blocked; loop again with a different proxy
    return None  # every attempt was blocked





def parse(response):
    if response is None:
        print("No response to parse.")
        return None, []
    soup = BeautifulSoup(response.text, 'html.parser')

    # Find the script tag with id "__NEXT_DATA__"
    script_tag = soup.find('script', {'id': '__NEXT_DATA__'})

    # Extract the text content of the script tag
    if script_tag:
        script_content = script_tag.string
        # Load the content as JSON
        json_data = json.loads(script_content)
    else:
        print("Script tag with id '__NEXT_DATA__' not found.")
        return None, []

    try:
        listings = json_data['props']['pageProps']['searchPageState']['cat1']['searchResults']['listResults']
        print(f"Found {len(listings)} listings on the page.")
    except KeyError as e:
        print(f"Error parsing listings: {e}")
        return None, []
    return json_data, listings


############################################################################################


def getFirst(listing):
    latlong = listing.get('latLong') or {}  # latLong can be present but null
    property_data = {
        'statusType': listing.get('statusType', None),
        'street': listing.get('addressStreet', None),
        'city': listing.get('addressCity', None),
        'state': listing.get('addressState', None),
        'zip': listing.get('addressZipcode', None),
        'bathroomCount': listing.get('baths', None),
        'bedroomCount': listing.get('beds', None),
        'livingArea': listing.get('area', None),
        'latitude': latlong.get('latitude', None),
        'longitude': latlong.get('longitude', None),
        'price': listing.get('price', None),
        'status': listing.get('statusType', None),
        'listingKey': listing.get('id', None),
        'imageUrl': listing.get('imgSrc', None),
        'brokerageName': listing.get('brokerName', None),
        'marketingStatusSimplified': listing.get('marketingStatusSimplifiedCd', None),
        'rawHomeStatus': listing.get('rawHomeStatusCd', None),
        'detailUrl': listing.get('detailUrl', None),
        'countryCurrency': listing.get('countryCurrency', None),
        'statusText': listing.get('statusText', None),
        'isUndisclosedAddress': listing.get('isUndisclosedAddress', None),
        'isZillowOwned': listing.get('isZillowOwned', None)
    }
    return property_data




def extract_property_details(html_content):
    # Parse the HTML content
    tree = html.fromstring(html_content)

    # Extract the JSON data from the script tag with id "__NEXT_DATA__"
    script_content = tree.xpath('//script[@id="__NEXT_DATA__"]/text()')
    if not script_content:
        return {"error": "No property data found"}

    try:
        # Parse the JSON content
        data = json.loads(script_content[0])

        # Navigate to the nested data structure
        gdp_client_cache = data.get("props", {}).get("pageProps", {}).get("componentProps", {}).get("gdpClientCache", {})

        # gdpClientCache is often a JSON-encoded string; decode it if so
        if isinstance(gdp_client_cache, str):
            gdp_client_cache = json.loads(gdp_client_cache)

        # The cache is keyed by a query hash; take the first (only) entry
        data = gdp_client_cache
        extracted_details = {}
        key = next(iter(data))
        data = data[key]

        # Property Details
        extracted_details['county'] = data['property'].get('county')
        extracted_details['countyFIPS'] = data['property'].get('countyFIPS')
        extracted_details['zipPlus4'] = data['property'].get('zipPlus4', '')
        
        # MLS Details
        extracted_details['appliances'] = data['property']['resoFacts'].get('appliances', [])
        extracted_details['architecturalStyle'] = data['property']['resoFacts'].get('architecturalStyle', '')
        extracted_details['buildingStyle'] = data['property'].get('homeType', '')
        extracted_details['condoFloorNumber'] = data['property'].get('condoFloorNumber', 0)
        extracted_details['coolingTypes'] = data['property']['resoFacts'].get('cooling', [])
        extracted_details['description'] = data['property'].get('description', '')
        extracted_details['directions'] = data['property'].get('directions', '')
        extracted_details['exteriorConstruction'] = data['property']['resoFacts'].get('exteriorFeatures', [])
        extracted_details['failedListingDate'] = data['property'].get('failedListingDate', '')
        extracted_details['floorCount'] = data['property']['resoFacts'].get('stories', 0)
        extracted_details['fullBathroomCount'] = data['property']['resoFacts'].get('bathroomsFull', 0)
        extracted_details['halfBathroomCount'] = data['property']['resoFacts'].get('bathroomsHalf', 0)

        # Handling interiorFeatures safely
        try:
            interior_features = data.get('property', {}).get('resoFacts', {}).get('interiorFeatures', [])
            if isinstance(interior_features, list):
                extracted_details['hasCeilingFan'] = any('ceiling fan' in feature.lower() for feature in interior_features if isinstance(feature, str))
                extracted_details['hasVaultedCeiling'] = any('vaulted ceiling' in feature.lower() for feature in interior_features if isinstance(feature, str))
            else:
                extracted_details['hasCeilingFan'] = False
                extracted_details['hasVaultedCeiling'] = False
        except Exception:
            extracted_details['hasCeilingFan'] = False
            extracted_details['hasVaultedCeiling'] = False

        extracted_details['heating'] = data['property']['resoFacts'].get('heating', [])
        extracted_details['initialListingStatus'] = data['property'].get('initialListingStatus', '')
        extracted_details['listingCategory'] = data['property'].get('listingCategory', '')
        extracted_details['lotSizeSquareFeet'] = data['property']['resoFacts'].get('lotSize', '')
        extracted_details['maxListPriceDate'] = data['property'].get('maxListPriceDate', '')
        extracted_details['minListPrice'] = data['property'].get('minListPrice', 0)
        extracted_details['minListPriceDate'] = data['property'].get('minListPriceDate', '')
        extracted_details['mlsId'] = data['property']['attributionInfo'].get('mlsId', '')
        extracted_details['mlsName'] = data['property']['attributionInfo'].get('mlsName', '')
        extracted_details['mlsNumber'] = data['property']['resoFacts'].get('listingId', '')
        extracted_details['neighborhood'] = data['property'].get('neighborhoodRegion', '')
        extracted_details['newConstruction'] = data['property'].get('newConstructionType', False)
        extracted_details['oneQuarterBathroomCount'] = data['property']['resoFacts'].get('bathroomsOneQuarter', 0)
        extracted_details['originalListingDate'] = data['property']['resoFacts'].get('onMarketDate', '')
        extracted_details['parkingSpaceCount'] = data['property']['resoFacts'].get('parkingCapacity', 0)
        extracted_details['partialBathroomCount'] = data['property']['resoFacts'].get('bathroomsPartial', 0)

        extracted_details['patioAndPorchFeatures'] = data['property']['resoFacts'].get('patioAndPorchFeatures', [])
        extracted_details['price'] = data['property'].get('zestimate', 0)  # Zestimate used as the price here
        extracted_details['daysOnMarket'] = data['property']['resoFacts'].get('cumulativeDaysOnMarket', 0)
        extracted_details['propertySubtype'] = data['property']['resoFacts'].get('propertySubType', '')
        extracted_details['propertyType'] = data['property'].get('homeType', '')
        extracted_details['rentalIndicator'] = data['property'].get('postingIsRental', False)
        extracted_details['rental'] = data['property'].get('postingIsRental', False)  # Retained for clarity, but can be removed if redundant
        extracted_details['roofType'] = data['property']['resoFacts'].get('roofType', '')
        extracted_details['soldDate'] = data['property'].get('dateSold', '')
        extracted_details['soldPrice'] = data['property'].get('lastSoldPrice', 0)
        extracted_details['statusSubtype'] = data['property'].get('statusSubtype', '')
        extracted_details['subDivision'] = data['property'].get('subdivisionName', '')
        extracted_details['title'] = data['property'].get('title', '')
        extracted_details['totalBuildingAreaSquareFeet'] = data['property']['resoFacts'].get('buildingArea', 0)
        extracted_details['yearBuilt'] = data['property'].get('yearBuilt', 0)
        extracted_details['salePriceIsEstimated'] = 'zestimate' in data['property'].get('adTargets', '')
        extracted_details['realtorId'] = data['property'].get('realtorId', '')
        extracted_details['copyright'] = data['property'].get('copyright', '')

        # Brokerage Details
        extracted_details['mls_brokerage_phoneNumber'] = data['property']['attributionInfo'].get('brokerPhoneNumber', '')
        extracted_details['mls_brokerage_address'] = data['property']['resoFacts'].get('associationName', '')
        extracted_details['mls_brokerage_email'] = data['property']['attributionInfo'].get('brokerEmail', '')
        extracted_details['mls_brokerage_websiteUrl'] = data['property']['attributionInfo'].get('brokerWebsiteUrl', '')

        property_data = data.get('property', {})
        reso_facts = property_data.get('resoFacts', {})
        attribution_info = property_data.get('attributionInfo', {})

        # Listing Agent Details (first agent only)
        listing_agents = attribution_info.get('listingAgents', [])
        if isinstance(listing_agents, list):
            if len(listing_agents) > 0:
                extracted_details['mls_listingAgents_name_1'] = listing_agents[0].get('memberFullName', '')
                extracted_details['mls_listingAgents_role_1'] = listing_agents[0].get('associatedAgentType', '')
                extracted_details['mls_listingAgents_primaryPhoneNumber_1'] = listing_agents[0].get('agentPhoneNumber', '')
                extracted_details['mls_listingAgents_email_1'] = listing_agents[0].get('agentEmail', '')
            else:
                extracted_details['mls_listingAgents_name_1'] = None
                extracted_details['mls_listingAgents_role_1'] = None
                extracted_details['mls_listingAgents_primaryPhoneNumber_1'] = None
                extracted_details['mls_listingAgents_email_1'] = None
        
        # School Details
        schools = property_data.get('schools', [])
        if isinstance(schools, list):
            if len(schools) > 0:
                extracted_details['mls_schools_name_1'] = schools[0].get('name', '')
                extracted_details['mls_schools_category_1'] = schools[0].get('category', '')
                extracted_details['mls_schools_district_1'] = schools[0].get('district', '')
            if len(schools) > 1:
                extracted_details['mls_schools_name_2'] = schools[1].get('name', '')
                extracted_details['mls_schools_category_2'] = schools[1].get('category', '')
                extracted_details['mls_schools_district_2'] = schools[1].get('district', '')
            if len(schools) > 2:
                extracted_details['mls_schools_name_3'] = schools[2].get('name', '')
                extracted_details['mls_schools_category_3'] = schools[2].get('category', '')
                extracted_details['mls_schools_district_3'] = schools[2].get('district', '')

        
        # Safe iteration for tax info
        tax_info = property_data.get('taxInfo', [])
        if isinstance(tax_info, list):
            if len(tax_info) > 0:
                extracted_details['mls_taxes_year_1'] = tax_info[0].get('year', '')
                extracted_details['mls_taxes_amount_1'] = tax_info[0].get('amount', 0)
                extracted_details['mls_taxes_description_1'] = tax_info[0].get('description', '')
            if len(tax_info) > 1:
                extracted_details['mls_taxes_year_2'] = tax_info[1].get('year', '')
                extracted_details['mls_taxes_amount_2'] = tax_info[1].get('amount', 0)
                extracted_details['mls_taxes_description_2'] = tax_info[1].get('description', '')

        extracted_details['monthlyHoaFee'] = data['property']['resoFacts'].get('hoaFee', '')
        extracted_details['propertyTaxRate'] = data['property'].get('taxRate', 0)
        extracted_details['parcelId'] = data['property'].get('parcelId', '')
        extracted_details['parcel_number'] = data['property'].get('parcel_number', '')
        
        # Image and Media Details
        # extracted_details['image_url'] = data['property'].get('imageUrl', '')
        extracted_details['image_url_uncropped'] = data['property'].get('imageUrlUncropped', '')
        
        # Tour and Contact Details
        extracted_details['mls_tourRequestTitle'] = data['property'].get('tourRequestTitle', '')
        extracted_details['mls_tourNextAvailableTime'] = data['property'].get('tourNextAvailableTime', '')
        extracted_details['mls_contactFormTitle'] = data['property'].get('contactFormTitle', '')

        # Safely process 'atAGlanceFacts' (note: these overwrite some fields set above)
        at_a_glance_facts = reso_facts.get('atAGlanceFacts', [])
        if isinstance(at_a_glance_facts, list):
            # Property Type
            extracted_details['propertyType'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Type'), None)
            
            # Year Built
            extracted_details['yearBuilt'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Year Built'), None)
            
            # Heating Types
            extracted_details['heating'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Heating'), None)
            
            # Cooling Types
            extracted_details['cooling'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Cooling'), None)
            
            # Parking Space Count
            parking_fact = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Parking'), None)
            extracted_details['parkingSpaceCount'] = parking_fact.split()[0] if parking_fact else None
            
            # Monthly HOA Fee
            extracted_details['monthlyHoaFee'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'HOA'), None)
            
            # Lot Size
            extracted_details['lotSizeSquareFeet'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Lot'), None)
            
            # Days on Market (Days on Zillow)
            days_on_market_fact = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Days on Zillow'), None)
            extracted_details['daysOnMarket'] = days_on_market_fact.split()[0] if days_on_market_fact else None
            
            # Price per Square Foot
            extracted_details['pricePerSquareFoot'] = next((fact['factValue'] for fact in at_a_glance_facts if fact.get('factLabel') == 'Price/sqft'), None)

        return extracted_details
    except Exception as e:
        print("error2: " + str(e))
        print('******')
        return {"error2": str(e)}

def saveToExcel(df):
    print('try saving')
    try:
        # Define file path
        file_path = r"C:\Users\alice\OneDrive\Desktop\zillow\zillow_data.csv"

        if os.path.exists(file_path):
            existing_df = pd.read_csv(file_path)
            # Reorder df to match the existing CSV headers
            df = df.reindex(columns=existing_df.columns)
            # Append without headers since the file already exists
            df.to_csv(file_path, mode='a', index=False, header=False)
        else:
            # First run: create the file with headers
            df.to_csv(file_path, index=False)
        print(f"Data saved successfully to {file_path}")

    except Exception as e:
        print(e)


def nextpage(json_data):
    if not json_data:
        return None
    pagination = (json_data.get('props', {}).get('pageProps', {})
                  .get('searchPageState', {}).get('cat1', {})
                  .get('searchList', {}).get('pagination') or {})
    next_path = pagination.get('nextUrl')
    if not next_path:
        return None  # last results page
    return 'https://www.zillow.com' + str(next_path)


baseurl = "https://www.zillow.com/{}/"


zipcode = '78258'
url = baseurl.format(zipcode)
print(url)

response = grabdata(url, zipcode)

json_data, listings = parse(response)

next_url = nextpage(json_data)

results = []

for listing in listings:
    first_data = getFirst(listing)
    
    url = listing['detailUrl']
    details = getDetails(url)
    combined_data = {**first_data, **details}
    results.append(combined_data)
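
The loop above collects everything into results but never persists it, and next_url is computed without being used. Here is a minimal sketch of how the two could be wired up, reusing the functions defined above; the page cap of 20 is an arbitrary safety limit I'm assuming, not something Zillow enforces:

# Walk the remaining result pages until pagination runs out
page_count = 1
while next_url and page_count < 20:  # arbitrary safety cap
    response = grabdata(next_url, zipcode)
    json_data, listings = parse(response)
    if not listings:
        break
    for listing in listings:
        first_data = getFirst(listing)
        details = getDetails(listing['detailUrl'])
        results.append({**first_data, **details})
    next_url = nextpage(json_data)  # returns None on the last page
    page_count += 1

# Persist everything in one go; saveToExcel appends to the CSV path defined above
saveToExcel(pd.DataFrame(results))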


