DevCEDTeam / CED

0 stars 0 forks source link

Sample Dataset #142

Open DevCEDTeam opened 5 hours ago

DevCEDTeam commented 5 hours ago

{"cells":[{"cell_type":"markdown","source":["\n","\n","\n","# This is formatted as code\n","\n","\n","#Step # 1: Mounted at /content/drive"],"metadata":{"id":"-x2p5VSe3NAJ"}},{"cell_type":"code","source":["from google.colab import drive\n","drive.mount('/content/drive')"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"mm6TA0397jcY","executionInfo":{"status":"ok","timestamp":1697906649794,"user_tz":420,"elapsed":18151,"user":{"displayName":"Council Education","userId":"02038367057430065145"}},"outputId":"8e56a9ec-c0f8-425a-8dc1-78e95e900a43"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount(\"/content/drive\", force_remount=True).\n"]}]},{"cell_type":"markdown","source":["#Step #2: Change director (\"cd\") | path content/drive/MyDrive/projects/dataset/oc-voter-list/master/"],"metadata":{"id":"8xRNs3233eNm"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":710,"status":"ok","timestamp":1697143985696,"user":{"displayName":"Council Education","userId":"02038367057430065145"},"user_tz":420},"id":"cdCl68eJmXws","outputId":"b1d45c09-84f8-45fa-ce9c-89ece2f7a65b"},"outputs":[{"output_type":"stream","name":"stdout","text":["/content/drive/MyDrive/projects/dataset/oc-voter-list/master\n"]}],"source":["cd /content/drive/MyDrive/projects/dataset/oc-voter-list/master/"]},{"cell_type":"markdown","source":["#Step # 3: ls: Lists the files and directories in the current working directory"],"metadata":{"id":"yUpRK7QM3i7L"}},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":617,"status":"ok","timestamp":1697143993742,"user":{"displayName":"Council Education","userId":"02038367057430065145"},"user_tz":420},"id":"gb-MrDo_mj6n","outputId":"30bde901-7656-4843-9c33-0da7b4eed494"},"outputs":[{"output_type":"stream","name":"stdout","text":[" cntywd_02102023.csv cntywd_02102023.xlsx\t cntywd_Hist_021023.txt\n"," cntywd02102023.txt cntywd_Hist_021023.csv\t'RVF Layout.pdf'\n"]}],"source":["!ls"]},{"cell_type":"markdown","metadata":{"id":"SybZsln8mJLT"},"source":["#Step #4: Print the top Row Header"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":1022,"status":"ok","timestamp":1697144006633,"user":{"displayName":"Council Education","userId":"02038367057430065145"},"user_tz":420},"id":"nL0vdDVamInv","outputId":"3a5b6a62-475e-4317-dc93-46de067de251"},"outputs":[{"output_type":"stream","name":"stdout","text":["['lVoterUniqueID', 'sAffNumber', 'szStateVoterID', 'sVoterTitle', 'szNameLast', 'szNameFirst', 'szNameMiddle', 'sNameSuffix', 'sGender', 'szSitusAddress', 'szSitusCity', 'sSitusState', 'sSitusZip', 'sHouseNum', 'sUnitAbbr', 'sUnitNum', 'szStreetName', 'sStreetSuffix', 'sPreDir', 'sPostDir', 'szMailAddress1', 'szMailAddress2', 'szMailAddress3', 'szMailAddress4', 'szMailZip', 'szPhone', 'szEmailAddress', 'dtBirthDate', 'sBirthPlace', 'dtRegDate', 'dtOrigRegDate', 'dtLastUpdate_dt', 'sStatusCode', 'szStatusReasonDesc', 'sUserCode1', 'sUserCode2', 'iDuplicateIDFlag', 'szLanguageName', 'szPartyName', 'szAVStatusAbbr', 'szAVStatusDesc', 'szPrecinctName', 'sPrecinctID', 'sPrecinctPortion', 'sDistrictID_0', 'iSubDistrict_0', 'szDistrictName_0', 'sDistrictID_1', 'iSubDistrict_1', 'szDistrictName_1', 'sDistrictID_2', 'iSubDistrict_2', 'szDistrictName_2', 'sDistrictID_3', 'iSubDistrict_3', 'szDistrictName_3', 'sDistrictID_4', 'iSubDistrict_4', 'szDistrictName_4', 'sDistrictID_5', 'iSubDistrict_5', 'szDistrictName_5']\n"]}],"source":["# To Using Pandas write a script to convert a .txt file with space-tab (t) separator and ANSI encoding into a CSV file using a Pandas snippet in Google Colab.\n","\n","# You can use the 'ISO-8859-1' encoding instead. Here's the corrected code:\n","\n","\n","import pandas as pd\n","\n","# Define the input .txt file path\n","input_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/master/cntywd02102023.txt'\n","\n","# Read the first row from the .txt file (assuming the header is in the first row)\n","with open(input_txt_file, 'r', encoding='ISO-8859-1') as file:\n"," # Read the first line\n"," first_line = file.readline()\n","\n","# Split the first line using space and tab (t) as the separator\n","header = first_line.strip().split('\t')\n","\n","# Print the top row header\n","print(header)\n","\n"]},{"cell_type":"markdown","metadata":{"id":"Xz2aigrqsRPn"},"source":["#Step #5: Print top 500 Rows"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":36790,"status":"ok","timestamp":1697144051048,"user":{"displayName":"Council Education","userId":"02038367057430065145"},"user_tz":420},"id":"8I3G-nAAsQxM","outputId":"787317eb-c4cd-4384-808a-55cec4893be5"},"outputs":[{"output_type":"stream","name":"stderr","text":[":20: DtypeWarning: Columns (13,18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n"]},{"output_type":"stream","name":"stdout","text":[" szEmailAddress szNameFirst szNameLast\n","0 NaN Ruth Rodriguez\n","1 brianschoonover@yahoo.com Brian Schoonover\n","2 brianschoonover@yahoo.com Julie Schoonover\n","3 ROSEMARYAVELLANEDA58@GMAIL.COM Rosemary Avellaneda\n","4 NaN Tony Garduno\n",".. ... ... ...\n","495 NaN Lareva Branden\n","496 NaN Bilberto Hernandez\n","497 ANTONIOEHERNANDEZ@GMAIL.COM Antonio Hernandezpablo\n","498 NaN Bryan Baltazar\n","499 NaN Oscar Rendon\n","\n","[500 rows x 3 columns]\n"]}],"source":["# It seems you want to read a .txt file with space-tab (t) separator and ANSI encoding into a Pandas DataFrame, create a new DataFrame with specific values, and then print only the top 500 rows. I've corrected the code and provided the following snippet for your task:\n","\n","\n","# My apologies for the error. It seems there's a mismatch between the number of columns in the header and the number of columns in the provided values. To fix this, we should ensure that the number of columns in the header matches the number of values you want to insert. Here's the corrected code:\n","\n","\n","import pandas as pd\n","\n","# Define the input .txt file path\n","input_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/master/cntywd02102023.txt'\n","\n","# Define data types for the columns\n","dtypes = {\n"," 'szEmailAddress': str,\n"," 'szNameFirst': str,\n"," 'szNameLast': str,\n","}\n","\n","# Read the data from the .txt file with specified data types\n","df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n","\n","# Remove rows with 'NaN' or '...' in the 'szEmailAddress' column\n","df = df[df['szEmailAddress'].apply(lambda x: x not in ['NaN', '...'])]\n","\n","# Print the top 500 rows of the specified columns\n","print(df[['szEmailAddress', 'szNameFirst', 'szNameLast']].head(500))\n","\n","\n","\n","\n","\n","\n"]},{"cell_type":"markdown","metadata":{"id":"GPJgl2gU3a2i"},"source":["#Step #6: Print top 500 Rows and remove 'NAN' rows"]},{"cell_type":"code","execution_count":null,"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"elapsed":31871,"status":"ok","timestamp":1697144180993,"user":{"displayName":"Council Education","userId":"02038367057430065145"},"usertz":420},"id":"V8k4xvDC3aH","outputId":"89375e03-ed8b-4626-d592-493d6ebd77e6"},"outputs":[{"output_type":"stream","name":"stderr","text":[":14: DtypeWarning: Columns (13,18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n"]},{"output_type":"stream","name":"stdout","text":[" szEmailAddress szNameFirst szNameLast\n","1 brianschoonover@yahoo.com Brian Schoonover\n","2 brianschoonover@yahoo.com Julie Schoonover\n","3 ROSEMARYAVELLANEDA58@GMAIL.COM Rosemary Avellaneda\n","5 Janely.avellaneda@yahoo.com Janely Garduno\n","7 froggylover1984@yahoo.com Vilma Alcantara\n","... ... ... ...\n","1160 FLAKA.PRADO22@GMAIL.COM Beverly Prado\n","1161 JNOATHANSOLANOF@GMAOL.COM Jonathan Solanoaparicio\n","1166 monica_551@hotmail.com Monica Estrada\n","1171 victormsoto03@gmail.com Victor Soto\n","1172 juliavertiz@me.com Julia Vertiz\n","\n","[500 rows x 3 columns]\n"]}],"source":["import pandas as pd\n","\n","# Define the input .txt file path\n","input_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/master/cntywd02102023.txt'\n","\n","# Define data types for the columns\n","dtypes = {\n"," 'szEmailAddress': str,\n"," 'szNameFirst': str,\n"," 'szNameLast': str,\n","}\n","\n","# Read the data from the .txt file with specified data types\n","df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n","\n","# Remove rows with 'NaN' values in the 'szEmailAddress' column\n","df = df.dropna(subset=['szEmailAddress'])\n","\n","# Print the top 500 rows of the specified columns\n","print(df[['szEmailAddress', 'szNameFirst', 'szNameLast']].head(500))\n","\n","\n","\n"]},{"cell_type":"markdown","source":["#Step #7: Total count of szEmailAddress rows"],"metadata":{"id":"eC1h_SOBrZ2L"}},{"cell_type":"code","source":["\n","\n","import pandas as pd\n","\n","# Define the file path to your dataset\n","input_csv_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-11-10-2023-f-update.csv'\n","output_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-11-10-2023-test.txt'\n","\n","# Read the CSV file into a Pandas DataFrame\n","df = pd.read_csv(input_csv_file, on_bad_lines='skip', delimiter=\"\t\")\n","\n","# Calculate the total count of 'szEmailAddress' rows\n","# total_count_szEmailAddress = df[['szEmailAddress']].count()\n","print(df.columns)\n","print(df.shape)\n","total_count_szEmailAddress = df.shape[0]\n","\n","print(f'Total count of szEmailAddress rows: {total_count_szEmailAddress}')\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"executionInfo":{"status":"ok","timestamp":1697145599726,"user_tz":420,"elapsed":1434,"user":{"displayName":"Council Education","userId":"02038367057430065145"}},"outputId":"fc72a68c-f47f-441b-9c09-191cbf1bae67","id":"lX5T1fCKQdnA"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stdout","text":["Index(['szEmailAddress', 'szNameFirst', 'szNameLast'], dtype='object')\n","(832368, 3)\n","Total count of szEmailAddress rows: 832368\n"]}]},{"cell_type":"markdown","source":["# Step #8: Sort out the top 5,000 rows for Mautic Export\n"],"metadata":{"id":"-DW2qOGjtS9t"}},{"cell_type":"code","source":["import pandas as pd\n","\n","# Define the input .txt file path\n","input_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/master/cntywd02102023.txt'\n","\n","# Define data types for the columns\n","dtypes = {\n"," 'szEmailAddress': str,\n"," 'szNameFirst': str,\n"," 'szNameLast': str,\n","}\n","\n","# Read the data from the .txt file with specified data types\n","df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n","\n","# Remove rows with 'NaN' values in the 'szEmailAddress' column\n","df = df.dropna(subset=['szEmailAddress'])\n","\n","# Print and export only the top 5,000 rows of the specified columns\n","output_csv_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-12-10-2023-c-mautic.csv'\n","df[['szEmailAddress', 'szNameFirst', 'szNameLast']].head(5000).to_csv(output_csv_file, index=False)\n","\n","# Calculate the total count of 'szEmailAddress' rows in the extracted data\n","total_count_szEmailAddress = df.shape[0]\n","\n","print(f'First 5,000 rows extracted and saved to {output_csv_file}')\n","print(f'Total count of szEmailAddress rows in the extracted data: {total_count_szEmailAddress}')\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"dlcTuNFctSY7","executionInfo":{"status":"ok","timestamp":1697145279847,"user_tz":420,"elapsed":43075,"user":{"displayName":"Council Education","userId":"02038367057430065145"}},"outputId":"84800f07-f937-4af3-d503-46f4d69e58ac"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":[":14: DtypeWarning: Columns (13,18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(input_txt_file, sep='\t', encoding='ISO-8859-1', dtype=dtypes)\n"]},{"output_type":"stream","name":"stdout","text":["First 5,000 rows extracted and saved to /content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-12-10-2023-c-mautic.csv\n","Total count of szEmailAddress rows in the extracted data: 832374\n"]}]},{"cell_type":"markdown","source":["#Step #9 : df.head() | Charts | Interactive Table\n","\n","\n","\n"],"metadata":{"id":"LJJrHwrsr8Sn"}},{"cell_type":"code","source":["df.head()"],"metadata":{"colab":{"base_uri":"https://localhost:8080/","height":206},"id":"kcc9H6f8RrAx","executionInfo":{"status":"ok","timestamp":1697144321882,"user_tz":420,"elapsed":212,"user":{"displayName":"Council Education","userId":"02038367057430065145"}},"outputId":"9f825da4-6189-448e-8abe-3e4d6b09a6c2"},"execution_count":null,"outputs":[{"output_type":"execute_result","data":{"text/plain":[" szEmailAddress szNameFirst szNameLast\n","0 brianschoonover@yahoo.com Brian Schoonover\n","1 brianschoonover@yahoo.com Julie Schoonover\n","2 ROSEMARYAVELLANEDA58@GMAIL.COM Rosemary Avellaneda\n","3 Janely.avellaneda@yahoo.com Janely Garduno\n","4 froggylover1984@yahoo.com Vilma Alcantara"],"text/html":["\n"," <div id=\"df-eea2d4de-74cc-4137-9fa4-34db0dec8326\" class=\"colab-df-container\">\n","

\n","\n","<table border=\"1\" class=\"dataframe\">\n"," \n"," <tr style=\"text-align: right;\">\n"," \n"," szEmailAddress\n"," szNameFirst\n"," szNameLast\n"," \n"," \n"," \n"," \n"," 0\n"," td>brianschoonover@yahoo.com</td\n"," Brian\n"," Schoonover\n"," \n"," \n"," 1\n"," td>brianschoonover@yahoo.com</td\n"," Julie\n"," Schoonover\n"," \n"," \n"," 2\n"," td>ROSEMARYAVELLANEDA58@GMAIL.COM</td\n"," Rosemary\n"," Avellaneda\n"," \n"," \n"," 3\n"," td>Janely.avellaneda@yahoo.com</td\n"," Janely\n"," Garduno\n"," \n"," \n"," 4\n"," td>froggylover1984@yahoo.com</td\n"," Vilma\n"," Alcantara\n"," \n"," \n","\n","
\n"," <div class=\"colab-df-buttons\">\n","\n"," <div class=\"colab-df-container\">\n"," <button class=\"colab-df-convert\" onclick=\"convertToInteractive('df-eea2d4de-74cc-4137-9fa4-34db0dec8326')\"\n"," title=\"Convert this dataframe to an interactive table.\"\n"," style=\"display:none;\">\n","\n"," <svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\" viewBox=\"0 -960 960 960\">\n"," <path d=\"M120-120v-720h720v720H120Zm60-500h600v-160H180v160Zm220 220h160v-160H400v160Zm0 220h160v-160H400v160ZM180-400h160v-160H180v160Zm440 0h160v-160H620v160ZM180-180h160v-160H180v160Zm440 0h160v-160H620v160Z\"/>\n"," \n"," \n","\n"," \n","\n"," \n","
\n","\n","\n","<div id=\"df-8a21f0a7-7889-4462-b827-7005a2f20ac4\">\n"," <button class=\"colab-df-quickchart\" onclick=\"quickchart('df-8a21f0a7-7889-4462-b827-7005a2f20ac4')\"\n"," title=\"Suggest charts.\"\n"," style=\"display:none;\">\n","\n","<svg xmlns=\"http://www.w3.org/2000/svg\" height=\"24px\"viewBox=\"0 0 24 24\"\n"," width=\"24px\">\n"," \n"," <path d=\"M19 3H5c-1.1 0-2 .9-2 2v14c0 1.1.9 2 2 2h14c1.1 0 2-.9 2-2V5c0-1.1-.9-2-2-2zM9 17H7v-7h2v7zm4 0h-2V7h2v10zm4 0h-2v-4h2v4z\"/>\n"," \n","\n"," \n","\n","\n","\n"," \n","
\n"," \n"," \n"]},"metadata":{},"execution_count":9}]},{"cell_type":"markdown","source":["#Step #10 : Convert .txt file to Excel .xlsx file format"],"metadata":{"id":"WoMUw_lSt1xr"}},{"cell_type":"code","source":["import pandas as pd\n","\n","# Assuming you already have a DataFrame named 'df' with your data\n","\n","# Create a slice copy of the first 5,000 rows\n","df_copy = df.head(5000).copy()\n","\n","# Select and retain only the desired columns\n","columns_to_keep = ['szEmailAddress', 'szNameFirst', 'szNameLast']\n","df_copy = df_copy[columns_to_keep]\n","\n","# Now, df_copy contains the first 5,000 rows with only the specified columns\n","output_excel_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBV000-12-10-2023.xlsx'\n","\n","# Save the DataFrame to an xlsx file using the 'openpyxl' engine\n","df_copy.to_excel(output_excel_file, engine='openpyxl')\n","\n"],"metadata":{"id":"gsCBP--ARlSa"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["#Step #11: Verify the OC Voter Registration List (szEmailAddress, szNameFirst, szNameLast) via Excel Pivot | path : NBC0003-11-10-2023-mautic.txt"],"metadata":{"id":"P3ZawGyLD9Iu"}},{"cell_type":"code","source":["import pandas as pd\n","\n","# Define the file paths\n","input_txt_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/master/cntywd02102023.txt'\n","output_csv_file = '/content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-11-10-2023-mautic.txt'\n","\n","# Read the entire .txt file into a Pandas DataFrame with the correct encoding\n","df = pd.read_csv(input_txt_file, error_bad_lines=False, delimiter=\"\t\", encoding='ISO-8859-1')\n","\n","# Save the DataFrame as a .csv file\n","df.to_csv(output_csv_file, index=False)\n","\n","# Calculate the total count of 'szEmailAddress' rows in the extracted data\n","total_count_szEmailAddress = df.shape[0]\n","\n","print(f'All rows extracted and saved to {output_csv_file}')\n","print(f'Total count of szEmailAddress rows in the extracted data: {total_count_szEmailAddress}')\n"],"metadata":{"colab":{"base_uri":"https://localhost:8080/"},"id":"8R4kJ576UTlr","executionInfo":{"status":"ok","timestamp":1697127752289,"user_tz":420,"elapsed":100120,"user":{"displayName":"Council Education","userId":"02038367057430065145"}},"outputId":"70adac42-7492-4d41-e0a2-5a3f1ffc55ca"},"execution_count":null,"outputs":[{"output_type":"stream","name":"stderr","text":[":8: FutureWarning: The error_bad_lines argument has been deprecated and will be removed in a future version. Use on_bad_lines in the future.\n","\n","\n"," df = pd.read_csv(input_txt_file, error_bad_lines=False, delimiter=\"\t\", encoding='ISO-8859-1')\n",":8: DtypeWarning: Columns (13,18,19) have mixed types. Specify dtype option on import or set low_memory=False.\n"," df = pd.read_csv(input_txt_file, error_bad_lines=False, delimiter=\"\t\", encoding='ISO-8859-1')\n"]},{"output_type":"stream","name":"stdout","text":["All rows extracted and saved to /content/drive/MyDrive/projects/dataset/oc-voter-list/campaigns/nbc#003/NBC0003-11-10-2023-mautic.txt\n","Total count of szEmailAddress rows in the extracted data: 1810164\n"]}]}],"metadata":{"colab":{"provenance":[],"mount_file_id":"1hCOG3iIDeViIZXoKYP7XRdYC5VzLX8Rz","authorship_tag":"ABX9TyPR2dIP7FbliRpgGz4X5Zq6"},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}