Transport-for-the-North / NorMITs-Demand

GNU General Public License v3.0

Could be more relative #746

Closed: github-actions[bot] closed this issue 3 years ago

github-actions[bot] commented 3 years ago

Could be more relative

https://github.com/Transport-for-the-North/NorMITs-Demand/blob/fc6da8507407b57cbb802228ac449b1a7f584f43/normits_demand/models/attraction_model.py#L314
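
The flagged BACKLOG sits on an import path built by string concatenation against a hard-coded drive root. A minimal sketch of the "more relative" approach, assuming paths resolve against the configurable import folder the constructor already takes (the helper and constant names here are hypothetical):

import os

_SOC_WEIGHTS_FNAME = 'soc_2_digit_sic_2018.csv'  # hypothetical constant

def soc_weights_path(import_folder: str) -> str:
    # Resolve against the configured import folder rather than 'Y:/'
    return os.path.join(import_folder, _SOC_WEIGHTS_FNAME)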

"""
Attraction Model for NorMITs Demand
"""
# Modules required
import os # File operations
from typing import List

import pandas as pd # Bread & butter
import numpy as np

from normits_demand.utils import utils as nup # Folder build utils

_default_msoa_lad = 'Y:/NorMITs Synthesiser/import/lad_to_msoa.csv'
_default_attraction_weights = 'attraction_weights.csv'
_default_attraction_mode_split = 'attraction_mode_split.csv'
_default_ntem = ('Y:/NorMITs Synthesiser/import/' +
                 'ntem_constraints/ntem_pa_ave_wday_2018.csv')

class AttractionModel:

    """
    NorMITs Attraction model.
    Attraction end of the NorMITs Trip End Model in the NorMITs Demand suite.
    """

    def __init__(self,
                 model_name: str = 'test',
                 build_folder: str = 'Y:/NorMITs Synthesiser',
                 iteration: str = 'iterx',
                 trip_origin: str = 'hb',
                 input_zones: str = 'msoa',
                 output_zones: str = 'test',
                 import_folder: str = 'Y:/NorMITs Synthesiser/import', # Add attraction data
                 model_folder: str = 'Y:/',
                 output_segments: List[str] = None,
                 attractions_name: str = 'non_freight_msoa_2018.csv',
                 attraction_weights = _default_attraction_weights,
                 attraction_mode_split = _default_attraction_mode_split,
                 lad_path: str = _default_msoa_lad,
                 ntem_control: bool = True,
                 ntem_path: str = _default_ntem,
                 k_factor_control: bool = False,
                 k_factor_path = None,
                 export_msoa: bool = False,
                 export_lad: bool = False,
                 export_uncorrected: bool = False,
                 export_target: bool = True):

        # File Paths

        # Model setup variables
        self.model_name = model_name
        self.build_folder = build_folder
        self.iteration = iteration
        self.trip_origin = trip_origin
        self.input_zones = input_zones
        self.output_zones = output_zones
        # attraction data has its own folder in import
        self.import_folder = os.path.join(import_folder, 'attraction_data')
        self.model_folder = model_folder

        # Model option variables
        # Avoid a shared mutable default for the segment list
        self.output_segments = output_segments if output_segments is not None else ['p', 'm']
        self.attractions_name = attractions_name
        self.attraction_weights = attraction_weights
        self.attraction_mode_split = attraction_mode_split
        self.ntem_control = ntem_control
        if ntem_path == 'default':
            ntem_path = _default_ntem
        self.ntem_path = ntem_path
        # lad_path is needed for the NTEM and k factor LAD lookups in run()
        self.lad_path = lad_path
        self.k_factor_control = k_factor_control
        self.k_factor_path = k_factor_path
        self.export_msoa = export_msoa
        self.export_lad = export_lad
        self.export_uncorrected = export_uncorrected
        self.export_target = export_target

    def get_attractions(self,
                        source='flat'):
        """
        Imports raw attraction data from either a flat file on the import
        drive or direct from the SQL server.

        Parameters
        ----------
        source:
            Where to look for the data. Takes 'flat' or 'sql'; for now only
            'flat' is implemented. The flat file path is derived from the
            import folder and the configured attractions file name.

        Returns
        ----------
        all_msoa:
            All attractions by modelled employment type at MSOA level.
        """
        flat_source = os.path.join(self.import_folder,
                                   self.attractions_name)

        # Blocking out most of this so it just imports everything.
        # BACKLOG: Adapt to handle the other 2 required datasets. Process if needed.
        all_msoa = pd.read_csv(flat_source)

        all_msoa = all_msoa.rename(columns={'geo_code':'msoa_zone_id'})

        return all_msoa

    def profile_attractions(self,
                            msoa_attractions,
                            attr_profile):
        """
        Takes an attraction profile defined by an import file in a model
        folder and builds attractions for each of its segments.

        Parameters
        ----------
        msoa_attractions:
            DataFrame containing msoa attractions by E segments, as imported
            from HSL data.

        attr_profile:
            DataFrame of weights with one row per purpose and one column per
            employment segment.

        Returns
        ----------
        a_list:
            A list of DataFrames, one per attraction type.
        """

        # Build all job list for iteration
        all_jobs = list(msoa_attractions)
        all_jobs.remove('msoa_zone_id')

        # Count purposes
        a_count = list(attr_profile['purpose'])

        # Build placeholder
        a_list = []

        # Loop over and weight
        for attraction in a_count:
            # Make a copy of attractions as placeholder
            attr_data = msoa_attractions.copy()

            # Get unique weighting to apply
            weightings = attr_profile[attr_profile['purpose'] == attraction]

            for job in all_jobs:
                print(job)
                this_weight = weightings[job]

                attr_data.loc[:,job] = attr_data.loc[:,job]*float(this_weight)
                # TODO: Some audits to make sure nothing is falling out

            attr_data.loc[:,attraction] = attr_data[all_jobs].sum(axis=1)
            attr_data = attr_data.drop(all_jobs, axis=1)

            a_list.append(attr_data)

        return a_list
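    # Worked example with hypothetical numbers: if attr_profile weights
    # 'Commute' as E01=1.0 and E02=0.2, a zone with E01=100 and E02=50 jobs
    # gets Commute attractions of 100*1.0 + 50*0.2 = 110.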

    def aggregate_to_model_zones_attr(self,
                                      attractions,
                                      model_zone_lookup_path,
                                      translation_name=None,
                                      max_level=True):
        """
        Aggregates to a target model zoning system, translating from the msoa
        base to whatever correspondence is supplied.
        In future, will need control for splitting and custom zones included.

        Parameters
        ----------
        attractions:
            DataFrame, or a list of DataFrames, describing attractions,
            likely at a given attraction segmentation.

        model_zone_lookup_path:
            Flat file containing correspondence factors between msoa and the
            target model zoning system.

        translation_name:
            Name of the lookup column holding the zone-overlap split factors.

        max_level:
            If True, drops columns used to calculate employment totals.

        Returns
        ----------
        attractions_w_zones:
            DataFrame or list of attractions as input, aggregated to model
            zones.
        """

        print(attractions['attractions'].sum())
        # TODO: This is where the model index comes in - would be nice to define and lookup automatically
        # Partially solved by the above but ALSO:
        # TODO: Need to figure out relative columning, which is a zoning translation fix
        # e.g Norms = model 1
        model_zone_conversion = pd.read_csv(model_zone_lookup_path)

        major_zone_name = list(model_zone_conversion)[0]
        # The second column should usually hold the minor zone name
        minor_zone_name = list(model_zone_conversion)[1]
        # Get the unique model zones in the minor zoning system
        unq_minor_zones = model_zone_conversion[minor_zone_name].drop_duplicates()
        print(unq_minor_zones)

        # If there are zone splits - just go with the zone that overlaps the most
        # Slightly cheaty and I think can be fixed by multiplying out.
        # TODO: Replace with multiply out method

        # Reindex
        mzc_cols = [major_zone_name, minor_zone_name, translation_name]
        model_zone_conversion = model_zone_conversion.reindex(mzc_cols, axis=1)
        # Need another function to pick the biggest zone in a split & make sure it's not in the internal area
        # Print totals to topline.

        # Reapply zonal splits to each
        # Msoa is hard coded

        # Workaround for minor zone sometimes being major
        if 'msoa' in major_zone_name:
            group_col = minor_zone_name
        elif 'msoa' in minor_zone_name:
            group_col = major_zone_name
        else:
            raise ValueError('No msoa column found in the zone lookup')

        # get group cols
        group_cols = [group_col]
        for col in list(attractions):
            if col != 'msoa_zone_id' and col != 'attractions':
                group_cols.append(col)

        # get sum cols
        sum_cols = group_cols.copy()
        sum_cols.append('attractions')

        attractions = attractions.merge(model_zone_conversion,
                                        how='inner',
                                        on='msoa_zone_id')

        attractions_w_zones = attractions.copy()

        # Weight attractions by the zone-overlap split factor before grouping
        attractions_w_zones['attractions'] = (
                attractions_w_zones['attractions'] *
                attractions_w_zones[translation_name])

        attractions_w_zones = attractions_w_zones.reindex(
                sum_cols,
                axis=1).groupby(
                        group_cols).sum().reset_index()

        print(attractions_w_zones['attractions'].sum())

        return attractions_w_zones
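    # Worked example with hypothetical factors: an MSOA holding 100
    # attractions that overlaps two model zones with split factors 0.6 and
    # 0.4 contributes 60 and 40 attractions respectively after the
    # multiply-and-group step above.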

    def run(self):

        """
        Function to run the attraction model. Takes a path to a lookup folder
        containing

        Parameters
        ----------
        model_folder = _zone_lookup_folder:
            Lookup folder for target model zoning system. Should contain the
            attraction splits required.

        input_zones_name:
            Name for aggregation in input data. Should be MSOA or LSOA.

        output_zones_name:
            Name for aggregation in output data. Should be the name of the target
            model zone.

        segmentation_type:
            'tfn' or 'ntem'. Defines which segments to write attractions for.
            If 'tfn', will build SOC splits into output attractions.
            If 'ntem', will build for 8 NTEM purposes only.

        model_zone_lookup_path:
            Path to area weighted correspondence between input and output zoning.
            Should be attraction weighting in future.

        write_output = True:
            Write csvs out or not.

        Returns
        ----------
        attractions:
            List of DataFrames containing attractions, split by attraction type.
            At ONS geography provided at input.

        all_attr:
            Single DataFrame with all attraction splits side by side. At ONS
            geography provided at input.

        zonal_attractions:
            List of DataFrames containing attractions, split by attraction type.
            At target model zoning system.

        all_zonal_attr:
            Single DataFrame with all attraction splits side by side.
            At target model zoning system.

        """
        # Define model zone lookup path
        model_zone_lookup_path = os.path.join(
            self.model_folder,
            self.output_zones.lower() + '_msoa_emp_weighted_lookup.csv')

        # Set output segmentation parameters
        if 'soc' in self.output_segments:
            print('Running soc weighting')
            # If there are TfN segments import the segmentation
            # BACKLOG: Could be more relative
            soc_weighted_jobs = pd.read_csv(
                os.path.join(self.import_folder, 'soc_2_digit_sic_2018.csv'))

            # Make the soc integer a category
            soc_weighted_jobs['soc_class'] = 'soc' + soc_weighted_jobs[
                    'soc_class'].astype(int).astype(str)

            # Get uniq socs
            soc_segments = soc_weighted_jobs[
                    'soc_class'].drop_duplicates().reset_index(drop=True)

            # We should apply the SOC attractions as a zonal weight
            # not use it as a total jobs number
            # this will give us the benefit of model purposes in HSL data
            skill_index = ['msoa_zone_id', 'soc_class', 'seg_jobs']
            skill_group = skill_index.copy()
            skill_group.remove('seg_jobs')

            skill_weight = soc_weighted_jobs.reindex(
                    skill_index,
                    axis=1).groupby(skill_group).sum().reset_index()
            skill_weight = skill_weight.pivot(index='msoa_zone_id',
                                              columns='soc_class',
                                              values='seg_jobs').reset_index()
            skill_weight['total'] = skill_weight['soc1'] + skill_weight['soc2'] + skill_weight['soc3']

            # Reduce to factors
            for soc_seg in soc_segments:
                skill_weight[soc_seg] = skill_weight[soc_seg] / skill_weight['total']
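            # e.g. with hypothetical zone totals soc1=50, soc2=30, soc3=20,
            # the factors become 0.5, 0.3 and 0.2, summing to 1 per zone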

            # Drop the total now the factors are built
            del skill_weight['total']

            # soc_segments already carries the 'soc' prefix built above

            # Define export segments
            group_segments = ['msoa_zone_id',
                              'p', 'm', 'soc']
            sum_segments = group_segments.copy()
            sum_segments.append('attractions')
        else:
            # Without soc weighting, export by purpose and mode only
            group_segments = ['msoa_zone_id', 'p', 'm']
            sum_segments = group_segments + ['attractions']

        output_dir = os.path.join(self.build_folder,
                                  self.iteration,
                                  'Attraction Outputs')
        nup.create_folder(output_dir, chDir=False)

        print("Getting MSOA attractions")
        # Filters down to the internal area only here.
        # Uses module-level defaults that aren't in the function call and shouldn't be.
        msoa_attractions = self.get_attractions(source = 'flat')

        all_jobs = list(msoa_attractions)
        all_jobs.remove('msoa_zone_id')

        # Seed with all commute as E01
        msoa_attractions['E01'] = msoa_attractions[all_jobs].sum(axis=1)
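        # i.e. E01 is the sum of every job-type column per zone, acting as a
        # total-employment proxy for the commute purpose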

        # Import attraction profile
        model_dir = os.listdir(self.model_folder)
        attr_profile_name = [x for x in model_dir if self.attraction_weights in x][0]

        attr_profile = pd.read_csv(os.path.join(
            self.model_folder,
            attr_profile_name))

        # Profile and weight attractions
        attractions = self.profile_attractions(msoa_attractions, attr_profile)

        # BACKLOG: Move segmentation to after base is built
        if 'soc' in self.output_segments:

            # Split into segments to split and those to retain
            retain_ph = []
            split_ph = []

            soc_purposes = ['Commute', 'Business']

            for attr in attractions:
                attr_type = list(attr)[-1]
                print(attr_type)
                if attr_type in soc_purposes:
                    split_ph.append(attr)
                else:
                    retain_ph.append(attr)

            # soc_segments defined above

            tfn_attr = []
            for attr_frame in split_ph:
                purpose = list(attr_frame)[-1]
                for segment in soc_segments:
                    print('Building TfN segment ' +
                          purpose + '_' + segment)
                    # Copy skill weights
                    target_weights = skill_weight.copy()

                    # Reindex to target cols
                    target_weights = target_weights.reindex(
                            ['msoa_zone_id', segment], axis=1)

                    # Join the SOC weighting factors
                    ph_frame = attr_frame.copy()
                    ph_frame = ph_frame.merge(target_weights,
                                              how='left',
                                              on='msoa_zone_id')

                    # Build weighted attractions with SOC
                    soc_seg_name = (purpose + '_' + segment)
                    ph_frame[soc_seg_name] = ph_frame[
                            purpose] * ph_frame[
                                    segment]
                    ph_frame = ph_frame.reindex(['msoa_zone_id', soc_seg_name], axis=1)

                    # Append to placeholder
                    tfn_attr.append(ph_frame)
                    # END
                # END

            # Append first list to end of TfN list
            for attr in retain_ph:
                tfn_attr.append(attr)
            del(retain_ph)
            del(attractions)
            attractions = tfn_attr
            del(tfn_attr)

        # Concatenate and consolidate attractions into single table.
        all_attr = pd.concat(attractions, sort=True)
        all_attr = all_attr.fillna(0)
        all_attr = all_attr.groupby('msoa_zone_id').sum().reset_index()

        col_heads = list(all_attr)
        col_heads.remove('msoa_zone_id')

        all_attr = pd.melt(all_attr, id_vars='msoa_zone_id',
                           value_vars=col_heads,
                           value_name='attractions', var_name='ph')

        p_dist = pd.DataFrame({
            'ph': ['Commute', 'Commute_soc1', 'Commute_soc2', 'Commute_soc3',
                   'Business', 'Business_soc1', 'Business_soc2', 'Business_soc3',
                   'Education', 'Shopping', 'Personal_business',
                   'Recreation_social', 'Visiting_friends', 'Holiday_day_trip'],
            'p': [1, 1, 1, 1,
                  2, 2, 2, 2,
                  3, 4, 5, 6, 7, 8],
            'soc': ['0', '1', '2', '3',
                    '0', '1', '2', '3',
                    '0', '0', '0', '0', '0', '0']
        })

        if 'soc' not in self.output_segments:
            p_dist = p_dist.drop('soc', axis=1)

        # Attach segments
        all_attr = all_attr.merge(p_dist,
                                  how='left',
                                  on='ph')
        all_attr = all_attr.drop('ph', axis=1)

        # Attach msoa mode_splits
        mode_splits = pd.read_csv(
            os.path.join(self.import_folder,
                         self.attraction_mode_split))

        all_attr = all_attr.merge(mode_splits,
                                  how='left',
                                  on = ['msoa_zone_id', 'p'])

        all_attr['attractions'] = all_attr['attractions'] * all_attr['mode_share']
        all_attr = all_attr.drop('mode_share', axis=1)

        all_attr = all_attr.reindex(
            sum_segments, axis=1).groupby(group_segments).sum().reset_index()

        all_attr = all_attr.sort_values(
                group_segments).reset_index(drop=True)

        hb_attr = all_attr.copy()
        nhb_attr = all_attr.copy()

        # Convert hb purposes to nhb: drop commute (1) and visiting friends
        # (7), then shift the remaining purposes up by 10
        nhb_attr = nhb_attr[nhb_attr['p'] != 1]
        nhb_attr = nhb_attr[nhb_attr['p'] != 7]
        nhb_attr['p'] = nhb_attr['p'] + 10

        # As a hack, infill time period on the nhb attractions
        tp_infill = pd.DataFrame({'ph':[1,1,1,1],
                                  'tp':[1,2,3,4]})
        nhb_attr['ph'] = 1
        nhb_attr = nhb_attr.merge(tp_infill,
                                  how = 'left',
                                  on = 'ph')
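        # Merging on the constant 'ph' key cross-joins each nhb row against
        # tp 1 to 4, repeating every attraction value once per time period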
        nhb_attr = nhb_attr.drop('ph', axis=1)
        del(tp_infill)

        ntem_lad_lookup = pd.read_csv(self.lad_path)

        if self.export_uncorrected:
            hb_attr.to_csv(
                os.path.join(
                    output_dir,
                    'hb_attractions_uncorrected.csv'),
                index=False)
            nhb_attr.to_csv(
                os.path.join(
                    output_dir,
                    'nhb_attractions_uncorrected.csv'),
                index=False)

        if self.ntem_control:
            # Do an NTEM adjustment

            ntem_totals = pd.read_csv(self.ntem_path)

            hb_attr, hb_adj, hb_audit, hb_lad = nup.control_to_ntem(hb_attr,
                                                                    ntem_totals,
                                                                    ntem_lad_lookup,
                                                                    group_cols = ['p', 'm'],
                                                                    base_value_name = 'attractions',
                                                                    ntem_value_name = 'Attractions',
                                                                    purpose = 'hb')
            print(hb_audit)

            nhb_attr, nhb_adj, nhb_audit, nhb_lad = nup.control_to_ntem(nhb_attr,
                                                                       ntem_totals,
                                                                       ntem_lad_lookup,
                                                                       group_cols = ['p', 'm', 'tp'],
                                                                       base_value_name = 'attractions',
                                                                       ntem_value_name = 'Attractions',
                                                                       purpose = 'nhb')

            if self.export_lad:
                hb_lad.to_csv(
                    os.path.join(output_dir,
                                 'lad_hb_attractions.csv'),
                    index=False)

                nhb_lad.to_csv(
                    os.path.join(
                        output_dir,
                        'lad_nhb_attractions.csv'),
                    index=False)
            print(nhb_audit)

        # Control to k factors
        if self.k_factor_path is not None:
            # Adjust to k factor for hb
            lad_lookup = pd.read_csv(self.lad_path)

            k_factors = os.listdir(self.k_factor_path)[0]
            k_factors = pd.read_csv(
                os.path.join(self.k_factor_path,
                             k_factors))
            k_factors = k_factors.reindex(['lad_zone_id','p','m','tp','attr_k'],
                                          axis=1)

            hb_purpose = [1,2,3,4,5,6,7,8]
            nhb_purpose = [12,13,14,15,16,18]

            hb_k_factors = k_factors[k_factors['p'].isin(hb_purpose)]
            hb_k_factors = hb_k_factors.drop('tp', axis=1)
            nhb_k_factors = k_factors[k_factors['p'].isin(nhb_purpose)]

            # Get hb_adjustment factors
            lad_lookup = lad_lookup.reindex(['lad_zone_id', 'msoa_zone_id'], axis=1)

            hb_attr = hb_attr.merge(lad_lookup,
                                    how = 'left',
                                    on = 'msoa_zone_id')
            # Seed zero infill
            hb_attr['attractions'] = hb_attr['attractions'].replace(0,0.001)

            # Build LA adjustment
            adj_fac = hb_attr.reindex(
                ['lad_zone_id', 'p', 'm', 'attractions'],
                axis=1).groupby(['lad_zone_id', 'p', 'm']).sum().reset_index()
            adj_fac = adj_fac.merge(hb_k_factors,
                                    how = 'left',
                                    on = ['lad_zone_id',
                                          'p',
                                          'm'])
            adj_fac['adj_fac'] = adj_fac['attr_k']/adj_fac['attractions']
            adj_fac = adj_fac.reindex(['lad_zone_id',
                                       'p',
                                       'm',
                                       'adj_fac'], axis=1)
            adj_fac['adj_fac'] = adj_fac['adj_fac'].replace(np.nan, 1)
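            # e.g. with a hypothetical k factor target of 1200 attractions
            # against a modelled LAD total of 1000, adj_fac = 1.2 and every
            # zone in that LAD scales up by 20%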

            hb_attr = hb_attr.merge(adj_fac,
                                    how = 'left',
                                    on = ['lad_zone_id',
                                          'p',
                                          'm'])
            hb_attr['attractions'] = hb_attr['attractions'] * hb_attr['adj_fac']

            hb_attr = hb_attr.drop(['lad_zone_id','adj_fac'], axis=1)

            # BACKLOG: TP attr
            nhb_attr = nhb_attr.merge(lad_lookup,
                                      how = 'left',
                                      on = 'msoa_zone_id')
            nhb_attr['attractions'] = nhb_attr['attractions'].replace(0,0.001)

            # Build LA adjustment
            adj_fac = nhb_attr.reindex(
                ['lad_zone_id', 'p', 'm', 'tp', 'attractions'],
                axis=1).groupby(['lad_zone_id', 'p', 'm', 'tp']).sum().reset_index()
            adj_fac = adj_fac.merge(nhb_k_factors,
                                    how = 'left',
                                    on = ['lad_zone_id',
                                          'p',
                                          'm',
                                          'tp'])
            adj_fac['adj_fac'] = adj_fac['attr_k']/adj_fac['attractions']
            adj_fac = adj_fac.reindex(['lad_zone_id',
                                       'p',
                                       'm',
                                       'tp',
                                       'adj_fac'], axis=1)
            adj_fac['adj_fac'] = adj_fac['adj_fac'].replace(np.nan, 1)

            nhb_attr = nhb_attr.merge(adj_fac,
                                      how = 'left',
                                      on = ['lad_zone_id',
                                            'p',
                                            'm',
                                            'tp'])
            nhb_attr['attractions'] = nhb_attr['attractions'] * nhb_attr['adj_fac']

            nhb_attr = nhb_attr.drop(['lad_zone_id','adj_fac'], axis=1)

        # Write input attractions
        if self.export_msoa:
            hb_attr.to_csv(
                os.path.join(
                    output_dir,
                    self.input_zones.lower() + '_hb_attractions.csv'),
                index=False)
            nhb_attr.to_csv(
                os.path.join(
                    output_dir,
                    self.input_zones.lower() + '_nhb_attractions.csv'),
                index=False)

        # Aggregate input attractions to model zones.
        zonal_hb_attr = self.aggregate_to_model_zones_attr(
            hb_attr,
            model_zone_lookup_path,
            translation_name = 'overlap_msoa_split_factor',
            max_level=True)

        zonal_nhb_attr = self.aggregate_to_model_zones_attr(
            nhb_attr,
            model_zone_lookup_path,
            translation_name = 'overlap_msoa_split_factor',
            max_level=True)

        hb_out_path = os.path.join(
            output_dir,
            self.output_zones.lower() +
            '_hb_attractions.csv')
        nhb_out_path = os.path.join(
            output_dir,
            self.output_zones.lower() +
            '_nhb_attractions.csv')

        # Write output totals
        if self.export_target:
            zonal_hb_attr.to_csv(
                hb_out_path,
                index=False)
            zonal_nhb_attr.to_csv(
                nhb_out_path,
                index=False)

        return [hb_out_path, nhb_out_path], [zonal_hb_attr, zonal_nhb_attr]
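
For reference, a minimal invocation sketch under the class defaults above, with the module path taken from the repo permalink (the model name, iteration and drive paths would need to match a real setup):

from normits_demand.models.attraction_model import AttractionModel

model = AttractionModel(model_name='test', iteration='iter1')
out_paths, zonal_attr = model.run()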

github-actions[bot] commented 3 years ago

Closed in 0da57d53355b6383f6d7be8a73bda7e61f55513d