samthiriot / gosp.dpp

the direct probabilistic pairing method for generation of synthetic populations
GNU General Public License v2.0

trying to generate billions of individuals when some frequencies are 0 #37

samthiriot closed this issue 6 years ago

samthiriot commented 6 years ago

When working on a case with many empty cells in the frequencies, some parameter settings lead to the generation of extremely large populations.

To reproduce the case using INSEE data:

library(data.table)
library(devtools)
load_all()

dwellings_raw <- read.csv(
        file="~/projets/2017 parcimonious iterated picking/application_lille/FD_LOGEMTZB_2014.txt",
        header=TRUE,
        nrows=50000,
        sep=";",
        check.names=FALSE
        #,
        #col_types = cols(b=col_factor())
        )

# INPER: number of persons in the household
# NBPI: number of rooms in the dwelling
# SURF: dwelling surface area (category)
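# build the gosp.dpp sample of dwellings, weighted by IPONDL (the INSEE dwelling weight)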

sample_dwellings <- gosp.dpp::create_sample(
                data=dwellings_raw,
                encoding = list(
                        # we provide no mapping
                       ),
                weight.colname="IPONDL"
                )

# free some memory
remove(dwellings_raw)

# CATL: dwelling category
#   1 : main residences
#   2 : occasional dwellings
#   3 : secondary residences
#   4 : vacant dwellings
#   Z : outside ordinary housing
#
# only put a household in a dwelling if it is vacant
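# (my reading of the two-row layout below, to be checked against the gosp.dpp docs:
#  the first row gives the probability that a dwelling of that CATL class hosts no
#  household, the second row that it hosts exactly one)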
pdi <- create_degree_probabilities_table(
                data.frame(
                    'CATL=1'=c(0.0, 1.0),
                    'CATL=2'=c(1.0, 0.0),
                    'CATL=3'=c(1.0, 0.0),
                    'CATL=4'=c(1.0, 0.0),
                    'CATL=Z'=c(1.0, 0.0),
                    check.names=FALSE
                    )
                )

#
households_raw <- read.csv(
        file="~/projets/2017 parcimonious iterated picking/application_lille/FD_INDCVIZB_2014.txt",
        header=TRUE,
        nrows=10000,
        sep=";",
        check.names=FALSE
        #,
        #col_types = cols(b=col_factor())
        )
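# build the sample of individuals/households, weighted by IPONDI (the INSEE individual weight)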
sample_households <- gosp.dpp::create_sample(
                data=households_raw,
                encoding = list(
                        # we provide no mapping
                       ),
                weight.colname="IPONDI"
                )
remove(households_raw)

pdj <- create_degree_probabilities_table(
                data.frame(
                    'STOCD=00'=c(1.0, 0.000001),
                    'STOCD=10'=c(0.000001, 1.0),
                    'STOCD=21'=c(0.000001, 1.0),
                    'STOCD=22'=c(0.000001, 1.0),
                    'STOCD=23'=c(0.000001, 1.0),
                    'STOCD=30'=c(1.0, 0.000001),
                    'STOCD=ZZ'=c(1.0, 0.000001),
                    check.names=FALSE
                    ),
                norm=TRUE
                )
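# note: exact zeros are avoided above by using 0.000001, and norm=TRUE is assumed
# to renormalise each column so that its probabilities sum to 1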

# STOCD: occupancy status
    # 00 : unoccupied ordinary dwelling
    # 10 : owner-occupier
    # 21 : tenant or subtenant of an unfurnished dwelling rented outside the HLM (social housing) sector
    # 22 : tenant or subtenant of an unfurnished HLM (social housing) dwelling
    # 23 : tenant or subtenant of a furnished dwelling or a hotel room
    # 30 : housed free of charge
    # ZZ : outside ordinary housing

# TYPL
    # dwelling type
    # 1 : house
    # 2 : apartment
    # 3 : hostel-type dwelling (logement-foyer)
    # 4 : hotel room
    # 5 : makeshift housing
    # 6 : independent room (with its own entrance)
    # Z : outside ordinary housing

# INPER: number of persons in the household

# SURF: dwelling surface area (category)
    # 6 5 3 4 7 1 2

pij <- create_matching_probabilities_table(
        normalise(
            data.frame(
                "SURF=1"=c(1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.001, 0.001, 0.001, 0.001, 0.001, 1.0), 
                "SURF=2"=c(1.0, 1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.001, 0.001, 0.001, 0.001, 0.0), 
                "SURF=3"=c(0.8, 1.0, 1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.001, 0.001, 0.001, 0.0), 
                "SURF=4"=c(0.3, 0.8, 1.0, 1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.001, 0.001, 0.0), 
                "SURF=5"=c(0.3, 0.3, 0.8, 1.0, 1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.1, 0.0), 
                "SURF=6"=c(0.1, 0.3, 0.3, 0.8, 1.0, 1.0, 1.0, 0.7, 0.4, 0.1, 0.1, 0.0), 
                "SURF=7"=c(0.01,  0.1, 0.3, 0.3, 0.8, 1.0, 1.0, 1.0, 0.7, 0.4, 0.4, 0.0), 
                row.names=c("INPER=1", "INPER=2", "INPER=3",  "INPER=4",  "INPER=5",  "INPER=6",  "INPER=7",  "INPER=8",  "INPER=9",  "INPER=10",  "INPER=11", "INPER=Z"), 
                check.names=FALSE
                )
            )
        )
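# (my reading of the table above: rows are household sizes INPER, columns are
#  dwelling surface classes SURF, each cell being the relative propensity of
#  pairing the two; normalise() is assumed to rescale these weights into
#  proper probabilities)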

prepared <- matching.prepare(sample_dwellings, sample_households, pdi, pdj, pij) 

solved <- matching.solve(prepared, nA=50000, nB=40000, nu.A=1, phi.A=0, delta.A=1, gamma=1, delta.B=1, phi.B=1, nu.B=1, verbose=TRUE)

> solved$gen$hat.nB
[1] 7002050000
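My reading of what happens (an assumption on my part, not checked against the solver internals): with degree probabilities as small as 0.000001, some household classes contribute almost no links per entity, so the only way the solver can satisfy the pairing constraints is to inflate the generated population. The scale of the inflation compared to the requested sizes:

# hat.nB is about 175 000 times the requested nB of 40 000
7002050000 / 40000   # = 175051.25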
samthiriot commented 6 years ago

As a first step, I added a verification that aborts the generation before it can freeze the machine (a sketch of the idea follows the transcript below):

> case <- matching.generate(solved, sample_dwellings, sample_households, verbose=TRUE)
starting generation
Error in matching.generate(solved, sample_dwellings, sample_households,  : 
  you are trying to generate a population of more than 1 million of individuals: case$gen$hat.nA=50000, case$gen$hat.nB=7002050000 .
That probably does not make much sense. It also would likely fail and freeze your system.
If you really want to do it, add parameter force=TRUE and cross fingers.
> 
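The check itself is roughly of the following shape (a sketch with hypothetical names, not the exact code that was committed):

# hypothetical sketch of the guard: refuse to generate absurdly large
# populations unless the caller explicitly forces it
check.generated.size <- function(case, max.size=1e6, force=FALSE) {
    if (!force && (case$gen$hat.nA > max.size || case$gen$hat.nB > max.size)) {
        stop("you are trying to generate a population of more than ", max.size,
             " individuals: case$gen$hat.nA=", case$gen$hat.nA,
             ", case$gen$hat.nB=", case$gen$hat.nB,
             ". If you really want to do it, add parameter force=TRUE.")
    }
    invisible(TRUE)
}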
samthiriot commented 6 years ago

Modified the computation of errors so that solutions implying large counts of entities are not preferred when another solution is available.
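Roughly, the idea is the following (again a sketch with hypothetical names; the actual change lives in the solver's error computation):

# hypothetical scoring of candidate solutions: when fit errors are comparable,
# penalise the solution that implies a larger generated population
score.candidate <- function(fit.error, hat.nA, hat.nB, size.weight=1e-12) {
    fit.error + size.weight * (hat.nA + hat.nB)
}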