Project 1: Cleaning

Code to Obtain Data

Here is the code I used to obtain data on 400 houses for sale in Charlotte, NC (the first ten pages of Zillow search results). I have included it because it differs slightly from what was presented in Slack.

# only these three libraries are actually used below
from bs4 import BeautifulSoup
import pandas as pd
import requests

# set some display settings for notebooks
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# request headers so the scrape looks like a normal browser visit (captchas are no fun)
req_headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'en-US,en;q=0.8',
    'upgrade-insecure-requests': '1',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
}
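As a quick way to confirm the headers get through before scraping all ten pages, here is a minimal sketch (my addition, not part of the original run; the 200-versus-403 interpretation is an assumption about how Zillow responds to blocked requests):

# sketch: a single test request with the headers above
test = requests.get('https://www.zillow.com/homes/for_sale/charlotte/', headers=req_headers)
print(test.status_code)  # 200 means the page came back; 403 usually means blocked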

# create the url for each zillow results page (page 1 has no page suffix)
city = 'charlotte/'  # *****change this city to what you want!!!!*****
base = 'https://www.zillow.com/homes/for_sale/'
url_links = [base + city] + [base + city + str(page) + '_p/' for page in range(2, 11)]

# fetch every page in a single session, reusing the headers above
with requests.Session() as s:
    responses = [s.get(url, headers=req_headers) for url in url_links]

# parse the contents of each response into a BeautifulSoup object
soups = [BeautifulSoup(r.content, 'html.parser') for r in responses]
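Before pulling fields out of the soups, it helps to confirm each page actually returned listing cards. A minimal sketch (the roughly-40-cards-per-page figure is an assumption about Zillow's layout at the time; it is what makes ten pages come out to about 400 houses):

# sketch: count the listing cards on each page
for page_no, page_soup in enumerate(soups, start=1):
    print('page', page_no, '->', len(page_soup.find_all('article')), 'cards')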

# every results page is scraped the same way, so do the per-page work in one
# function instead of repeating it ten times
def scrape_page(soup):
    page_df = pd.DataFrame()

    # pull the specified fields using beautiful soup; the details, footer
    # (home type), top bar (last updated) and brokerage cards are also
    # available from these soups if needed later
    page_df['prices'] = soup.find_all(class_='list-card-price')
    page_df['address'] = soup.find_all(class_='list-card-addr')
    page_df['beds'] = soup.find_all('ul', class_='list-card-details')

    # loop through the listing cards, pull the href and strip out the address tag
    urls = []
    for card in soup.find_all('article'):
        href = card.find('a', class_='list-card-link')
        addresses = href.find('address')
        addresses.extract()
        urls.append(href)

    # import urls into a links column and remove the html tags
    page_df['links'] = urls
    page_df['links'] = page_df['links'].astype('str')
    page_df['links'] = page_df['links'].replace('<a class="list-card-link" href="', ' ', regex=True)
    page_df['links'] = page_df['links'].replace('" tabindex="0"></a>', ' ', regex=True)

    return page_df

# scrape all ten pages and stack them into one dataframe
# (pd.concat replaces DataFrame.append, which newer pandas removed)
df = pd.concat([scrape_page(s) for s in soups], ignore_index=True)
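A quick check (my addition) that the ten pages stacked into one frame:

# sketch: ten pages of roughly 40 cards each should give about 400 rows
print(df.shape)
print(df.head())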

# convert columns to str
df['prices'] = df['prices'].astype('str')
df['address'] = df['address'].astype('str')
df['beds'] = df['beds'].astype('str')


# remove html tags from the prices and address columns
df['prices'] = df['prices'].replace('<div class="list-card-price">', ' ', regex=True)
df['prices'] = df['prices'].replace('</div>', ' ', regex=True)
df['address'] = df['address'].replace('<address class="list-card-addr">', ' ', regex=True)
df['address'] = df['address'].replace('</address>', ' ', regex=True)

# strip everything that is not a digit out of prices
df['prices'] = df['prices'].str.replace(r'\D', '', regex=True)

# the scraped price string repeats the digits, so keep only the first half
price = []
for i in df['prices']:
    new_i = i[0:int(len(i) / 2)]
    price.append(new_i)
df['price'] = price
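The halving works because the price digits come through duplicated in the scraped markup (my reading of why the string length doubles); a small illustration with an assumed input:

# illustration, assumed input: a $350,000 card whose digits appear twice
raw = '350000350000'
print(raw[0:int(len(raw) / 2)])  # -> '350000'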


# remove html tags from beds column
df['beds'] = df.beds.replace('<ul class="list-card-details"><li class="">', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->bds</abbr></li><li class="">', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->bd</abbr></li><li class="">', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->ba</abbr></li><li class="">', ' ', regex=True)
df['beds'] = df['beds'].replace('<abbr class="list-card-label"> <!-- -->sqft</abbr></li><li class="list-card-statusText">', ' ', regex=True)

# slice beds, baths and sqft out of the cleaned string by position
df['no_beds'] = df.beds.str.slice(start=1, stop=2)
df['baths'] = df.beds.str.slice(start=3, stop=4)
df['sqft'] = df.beds.str.slice(start=5, stop=10)
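The fixed-position slices assume single-digit bed and bath counts; a sketch of a sturdier alternative I did not use here pulls the three numbers with one regex instead:

# sketch of an alternative: extract beds, baths and sqft in one pass,
# which also survives double-digit bed counts and decimal baths
parts = df['beds'].str.extract(r'(\d+)\D+?([\d.]+)\D+?([\d,]+)')
parts.columns = ['no_beds', 'baths', 'sqft']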

# remove dashes and commas from sqft so it can be converted to float below
df['sqft'] = df['sqft'].replace('-', '', regex=True)
df['sqft'] = df['sqft'].replace(',', '', regex=True)

# drop rows where the price came back blank
df = df[(df['price'] != '') & (df['price'] != ' ')]

# convert price and sqft to float
df['price'] = df['price'].astype('float')
df['sqft'] = df['sqft'].astype('float')

# remove spaces from link column
df['links'] = df.links.str.replace(' ', '')

# keep just the cleaned columns, in the order we want
df_final = df[['price', 'address', 'no_beds', 'baths', 'sqft']]

df_final.to_csv('out.csv', index=False)
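To confirm the export worked, the file reads straight back in (a usage sketch):

# sketch: reload the saved csv and spot-check rows and dtypes
check = pd.read_csv('out.csv')
print(check.dtypes)
print(len(check))  # expect roughly 400 rows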