panosathans83 / NashvilleHousing---Data-Cleaning-in-SQL

0 stars 0 forks source link

NashvilleHousing #1

Open panosathans83 opened 1 year ago

panosathans83 commented 1 year ago

This is a Data Cleaning Project in SQL. The data source is an excel file containing 56477 rows of data. Steps followed:

  1. Standardized Date Format to get rid of the hour which was not needed.
  2. Populated Nulls in Property Address Data Depending on ParcelID.
  3. Split Property Address into Individual Columns (Address, City) Using SUBSTRING.
  4. Split Owner Address into Individual Columns (Address, City, State) Using PARSENAME.
  5. Changed Y and N to Yes and No in "Sold as Vacant" Column.
  6. Removed Duplicates.
  7. Deleted Unused Columns.
panosathans83 commented 1 year ago

-- Cleaning Data in SQL Queries

SELECT * FROM PortfolioProject.dbo.NashvilleHousing

-- Standardize Date Format (We don't need the hour)

SELECT SaleDateConverted, CONVERT(Date,SaleDate) FROM PortfolioProject.dbo.NashvilleHousing

UPDATE NashvilleHousing SET SaleDate = CONVERT(Date,SaleDate)

ALTER TABLE NashvilleHousing ADD SaleDateConverted Date;

UPDATE NashvilleHousing SET SaleDateConverted = CONVERT(Date,SaleDate)

-- Populate Nulls in Property Address Data Depending on ParcelID

SELECT * FROM PortfolioProject.dbo.NashvilleHousing WHERE PropertyAddress IS NULL ORDER BY ParcelID

SELECT a.ParcelID, a.PropertyAddress, b.ParcelID, b.PropertyAddress, ISNULL(a.PropertyAddress, b.PropertyAddress) FROM PortfolioProject.dbo.NashvilleHousing a JOIN PortfolioProject.dbo.NashvilleHousing b ON a.ParcelID = b.ParcelID AND a.[UniqueID ] <> b.[UniqueID ] WHERE a.PropertyAddress IS NULL

UPDATE a SET PropertyAddress = ISNULL(a.PropertyAddress, b.PropertyAddress) FROM PortfolioProject.dbo.NashvilleHousing a JOIN PortfolioProject.dbo.NashvilleHousing b ON a.ParcelID = b.ParcelID AND a.[UniqueID ] <> b.[UniqueID ] WHERE a.PropertyAddress IS NULL

-- Splitting Property Address Into Individual Columns (Address, City) Using SUBSTRING

SELECT PropertyAddress FROM PortfolioProject.dbo.NashvilleHousing

SELECT SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1 ) AS Address, SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress)) AS Address FROM PortfolioProject.dbo.NashvilleHousing

ALTER TABLE NashvilleHousing ADD PropertySplitAddress NVARCHAR(255);

UPDATE NashvilleHousing SET PropertySplitAddress = SUBSTRING(PropertyAddress, 1, CHARINDEX(',', PropertyAddress) -1 )

ALTER TABLE NashvilleHousing ADD PropertySplitCity NVARCHAR(255);

UPDATE NashvilleHousing SET PropertySplitCity = SUBSTRING(PropertyAddress, CHARINDEX(',', PropertyAddress) +1, LEN(PropertyAddress))

-- Splitting Owner Address Into Individual Columns (Address, City, State) Using PARSENAME

SELECT * FROM PortfolioProject.dbo.NashvilleHousing

SELECT OwnerAddress FROM PortfolioProject.dbo.NashvilleHousing

SELECT PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3), PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2), PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1) FROM PortfolioProject.dbo.NashvilleHousing

ALTER TABLE NashvilleHousing ADD OwnerSplitAddress Nvarchar(255);

UPDATE NashvilleHousing SET OwnerSplitAddress = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 3)

ALTER TABLE NashvilleHousing ADD OwnerSplitCity Nvarchar(255);

UPDATE NashvilleHousing SET OwnerSplitCity = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 2)

ALTER TABLE NashvilleHousing ADD OwnerSplitState Nvarchar(255);

UPDATE NashvilleHousing SET OwnerSplitState = PARSENAME(REPLACE(OwnerAddress, ',', '.'), 1)

SELECT * FROM PortfolioProject.dbo.NashvilleHousing

-- Change Y and N to Yes and No in "Sold as Vacant" Column

SELECT DISTINCT (SoldAsVacant), COUNT(SoldAsVacant) FROM PortfolioProject.dbo.NashvilleHousing GROUP BY SoldAsVacant ORDER BY 2

SELECT SoldAsVacant, CASE WHEN SoldAsVacant = 'Y' THEN 'Yes' WHEN SoldAsVacant = 'N' THEN 'No' ELSE SoldAsVacant END FROM PortfolioProject.dbo.NashvilleHousing

UPDATE NashvilleHousing SET SoldAsVacant = CASE WHEN SoldAsVacant = 'Y' THEN 'Yes' WHEN SoldAsVacant = 'N' THEN 'No' ELSE SoldAsVacant END

-- Removing Duplicates

WITH RowNumCTE AS( SELECT *, ROW_NUMBER() OVER ( PARTITION BY ParcelID, PropertyAddress, SalePrice, SaleDate, LegalReference ORDER BY UniqueID ) row_num

FROM PortfolioProject.dbo.NashvilleHousing )

SELECT * FROM RowNumCTE WHERE row_num > 1 ORDER BY PropertyAddress

DELETE FROM RowNumCTE WHERE row_num > 1

SELECT * FROM RowNumCTE WHERE row_num > 1 ORDER BY PropertyAddress

-- Deleting Unused Columns

SELECT * FROM PortfolioProject.dbo.NashvilleHousing

ALTER TABLE PortfolioProject.dbo.NashvilleHousing DROP COLUMN OwnerAddress, TaxDistrict, PropertyAddress, Saledate