tidyverse / readr

Read flat files (csv, tsv, fwf) into R
https://readr.tidyverse.org
Other
999 stars 286 forks source link

read_csv changes periods in column names to spaces #130

Closed nguinasso closed 9 years ago

nguinasso commented 9 years ago

Test of readr Norman 2015-04-14

cltype <-'DcdcccccDcddcddc'
cltype1<-'cccccccccc'
cltype2<-'cccccccddc'
cltype3<-'cccccddccc'
cltype4<-'cccdDD'
cltyp<-paste(cltype,cltype1,cltype2,cltype3,cltype4,sep="")
str_length(cltyp)

## [1] 52

#(http://gulfsciencedata.bp.com/go/doctype/6145/179482/)
ocfile <- '../WaterChemistry_W-01v02-01.csv'
cltyp

## [1] "DcdcccccDcddcddccccccccccccccccccddccccccddccccccdDD"

ocfile

## [1] "../WaterChemistry_W-01v02-01.csv"

rm(bpwc,bpwc.h,bpwc.h1)

## Warning in rm(bpwc, bpwc.h, bpwc.h1): object 'bpwc' not found

## Warning in rm(bpwc, bpwc.h, bpwc.h1): object 'bpwc.h' not found

## Warning in rm(bpwc, bpwc.h, bpwc.h1): object 'bpwc.h1' not found

system.time(bpwc  <-read.csv(ocfile))

##    user  system elapsed 
##   91.67    1.94   93.61

library(readr)
system.time(bpwc.h  <-read_csv(ocfile,col_types=cltyp,prog=FALSE))

## 
|================================================================================| 100% 1565 MB

##    user  system elapsed 
##   25.63    0.55   26.18

head(problems(bpwc.h))

## [1] row      col      expected actual  
## <0 rows> (or 0-length row.names)

system.time(bpwc.h1  <-read_csv(ocfile,prog=FALSE))

## 
|================================================================================| 100% 1565 MB

## Warning: 11742041 problems parsing '../WaterChemistry_W-01v02-01.csv'. See
## problems(...) for more details.

##    user  system elapsed 
##   31.37    0.75   32.12

head(problems(bpwc.h1))

##     row col       expected       actual
## 1 12324  25     an integer K1009141-005
## 2 12324  28 T/F/TRUE/FALSE        Water
## 3 12324  29 T/F/TRUE/FALSE        Water
## 4 12324  49 T/F/TRUE/FALSE          SMP
## 5 12325  25     an integer K1009141-005
## 6 12325  28 T/F/TRUE/FALSE        Water

sessionInfo()

## R version 3.1.3 Patched (2015-03-16 r68170)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 7 x64 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] readr_0.1.0.9000 stringr_0.6.2   
## 
## loaded via a namespace (and not attached):
## [1] digest_0.6.4    evaluate_0.5.5  formatR_1.0     htmltools_0.2.6
## [5] knitr_1.9       Rcpp_0.11.5     rmarkdown_0.5.1 tools_3.1.3    
## [9] yaml_2.1.13

str(bpwc)

## 'data.frame':    2479957 obs. of  52 variables:
##  $ Data.Publication.Date         : Factor w/ 1 level "2014-05-30": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Data.Publication.Reference    : Factor w/ 1 level "Water Column Chemistry W-01v02-01": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Study.Reference.Number        : num  1022 1022 1022 1022 1022 ...
##  $ Study.Name                    : Factor w/ 97 levels "","American Diver Cruise 01 JUL 17-AUG 6 2010",..: 18 18 18 18 18 18 18 18 18 18 ...
##  $ Harmonized.Study.Name         : Factor w/ 67 levels "1-meter MOCNESS Plankton: September 2010 (Walton Smith I)",..: 21 21 21 21 21 21 21 21 21 21 ...
##  $ Harmonized.Cruise.ID          : Factor w/ 138 levels "American Diver 01 (07-21-10 to 08-05-10)",..: 83 83 83 83 83 83 83 83 83 83 ...
##  $ Location.or.Station.ID        : Factor w/ 1347 levels "","000-009","000-018",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Interpretive.Sample.ID        : Factor w/ 22772 levels "0V010011","0V010021",..: 17710 17710 17710 17710 17710 17710 17710 17710 17710 17710 ...
##  $ Sample.Date                   : Factor w/ 374 levels "2010-05-05","2010-05-06",..: 71 71 71 71 71 71 71 71 71 71 ...
##  $ Sample.Time                   : Factor w/ 1401 levels "00:00:00.0000000",..: 406 406 406 406 406 406 406 406 406 406 ...
##  $ Latitude                      : num  28.7 28.7 28.7 28.7 28.7 ...
##  $ Longitude                     : num  -88.5 -88.5 -88.5 -88.5 -88.5 ...
##  $ Spatial.Zone                  : Factor w/ 7 levels "AL","Federal",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ Upper.Depth                   : num  1467 1467 1467 1467 1467 ...
##  $ Lower.Depth                   : num  1467 1467 1467 1467 1467 ...
##  $ Depth.Unit                    : Factor w/ 2 levels "","m": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Field.Fraction                : Factor w/ 3 levels "Dissolved","Suspended particulate",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Sample.Type                   : Factor w/ 6 levels "Equipment blank",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Field.Matrix                  : Factor w/ 2 levels "Solid (non-specific)",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Field.Sample.Material         : Factor w/ 4 levels "Filter from the Payne filtration method, 0.7um fiber glass",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Field.Data.Verification.Status: Factor w/ 3 levels "Complete","Not started",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Field.Data.Verification.Result: Factor w/ 9 levels "1 of 15 DQOs could not be verified: Coordinates correctness",..: 9 9 9 9 9 9 9 9 9 9 ...
##  $ Analytical.Sample.ID          : Factor w/ 26737 levels "0V010011","0V010021",..: 21145 21145 21145 21145 21145 21145 21145 21145 21145 21145 ...
##  $ Lab                           : Factor w/ 9 levels "Alpha Analytical",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ Laboratory.Sample.ID          : Factor w/ 34753 levels "1005011-01","1005011-01D",..: 14840 14840 14840 14840 14840 14840 14840 14840 14840 14840 ...
##  $ ASR.Number                    : Factor w/ 23 levels "","ARF 007-0",..: 20 20 20 20 20 20 20 20 20 20 ...
##  $ SDG                           : Factor w/ 2642 levels "10-0059","10-0061",..: 1265 1265 1265 1265 1265 1265 1265 1265 1265 1265 ...
##  $ Lab.Matrix                    : Factor w/ 3 levels "","Solid (non-specific)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lab.Material                  : Factor w/ 5 levels "","Filter (liquid)",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Parameter.Type                : Factor w/ 8 levels "Biomarker","BTEX/PIANO",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ Chemical.Name                 : Factor w/ 330 levels "1-Decene","1-Heptene/1,2-DMCP (trans)",..: 122 124 125 126 129 131 136 137 138 142 ...
##  $ Chemical.Code                 : Factor w/ 330 levels "100-41-4","100-42-5",..: 217 82 32 144 124 73 66 65 80 84 ...
##  $ Chemical.Type                 : Factor w/ 3 levels "Surrogate","Target analyte",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ Concentration...NDs.at.MDL    : num  0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 ...
##  $ Concentration...NDs.at.zero   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unit                          : Factor w/ 3 levels "pct","ug/L","ug/Samp": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Final.Qualifiers              : Factor w/ 11 levels "","F","J","JF",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ Validation.Qualifiers         : Factor w/ 15 levels "","F","FJ","j",..: 10 10 10 10 10 10 10 10 10 10 ...
##  $ Lab.Qualifiers                : Factor w/ 52 levels "","*","B","B,J",..: 44 44 44 44 44 44 44 44 44 44 ...
##  $ Nondetect.Flag                : Factor w/ 2 levels "N","Y": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Validation.Level              : Factor w/ 7 levels "Not Validated",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ Reporting.Limit               : num  0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 ...
##  $ Method.Detection.Limit        : num  0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 ...
##  $ Measurement.Basis             : Factor w/ 1 level "Wet": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Lab.Fraction                  : Factor w/ 2 levels "Suspended particulate",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Preparation.Method            : Factor w/ 17 levels "","3510C - Separatory funel liquid-liquid extraction",..: 15 15 15 15 15 15 15 15 15 15 ...
##  $ Analytical.Method             : Factor w/ 20 levels "8015B - Nonhalogenated Organics Using GC/FID",..: 20 20 20 20 20 20 20 20 20 20 ...
##  $ Base.Analytical.Method        : int  8270 8270 8270 8270 8270 8270 8270 8270 8270 8270 ...
##  $ Lab.Replicate                 : Factor w/ 4 levels "","1","LD","SMP": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Dilution.Factor               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date.Extracted                : Factor w/ 358 levels "","2010-05-11",..: 70 70 70 70 70 70 70 70 70 70 ...
##  $ Date.Analyzed                 : Factor w/ 536 levels "","2010-05-17",..: 67 67 67 67 67 67 67 67 67 67 ...

str(bpwc.h)

## Classes 'tbl_df', 'tbl' and 'data.frame':    2479957 obs. of  52 variables:
##  $ Data Publication Date         : Date, format: "2014-05-30" "2014-05-30" ...
##  $ Data Publication Reference    : chr  "Water Column Chemistry W-01v02-01" "Water Column Chemistry W-01v02-01" "Water Column Chemistry W-01v02-01" "Water Column Chemistry W-01v02-01" ...
##  $ Study Reference Number        : num  1022 1022 1022 1022 1022 ...
##  $ Study Name                    : chr  "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" ...
##  $ Harmonized Study Name         : chr  "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" "Deepwater Dispersant Sampling Program" ...
##  $ Harmonized Cruise ID          : chr  "Ocean Veritas 09 (07-13-10 to 07-17-10)" "Ocean Veritas 09 (07-13-10 to 07-17-10)" "Ocean Veritas 09 (07-13-10 to 07-17-10)" "Ocean Veritas 09 (07-13-10 to 07-17-10)" ...
##  $ Location or Station ID        : chr  "" "" "" "" ...
##  $ Interpretive Sample ID        : chr  "SW-20100715-OV09-001" "SW-20100715-OV09-001" "SW-20100715-OV09-001" "SW-20100715-OV09-001" ...
##  $ Sample Date                   : Date, format: "2010-07-15" "2010-07-15" ...
##  $ Sample Time                   : chr  "07:22:00.0000000" "07:22:00.0000000" "07:22:00.0000000" "07:22:00.0000000" ...
##  $ Latitude                      : num  28.7 28.7 28.7 28.7 28.7 ...
##  $ Longitude                     : num  -88.5 -88.5 -88.5 -88.5 -88.5 ...
##  $ Spatial Zone                  : chr  "Wellhead" "Wellhead" "Wellhead" "Wellhead" ...
##  $ Upper Depth                   : num  1467 1467 1467 1467 1467 ...
##  $ Lower Depth                   : num  1467 1467 1467 1467 1467 ...
##  $ Depth Unit                    : chr  "m" "m" "m" "m" ...
##  $ Field Fraction                : chr  "Total" "Total" "Total" "Total" ...
##  $ Sample Type                   : chr  "Natural sample" "Natural sample" "Natural sample" "Natural sample" ...
##  $ Field Matrix                  : chr  "Water" "Water" "Water" "Water" ...
##  $ Field Sample Material         : chr  "Surface Water" "Surface Water" "Surface Water" "Surface Water" ...
##  $ Field Data Verification Status: chr  "Underway" "Underway" "Underway" "Underway" ...
##  $ Field Data Verification Result: chr  "Unassigned" "Unassigned" "Unassigned" "Unassigned" ...
##  $ Analytical Sample ID          : chr  "SW-20100715-OV09-001" "SW-20100715-OV09-001" "SW-20100715-OV09-001" "SW-20100715-OV09-001" ...
##  $ Lab                           : chr  "LLI" "LLI" "LLI" "LLI" ...
##  $ Laboratory Sample ID          : chr  "6035886" "6035886" "6035886" "6035886" ...
##  $ ASR Number                    : chr  "ASR 052-draft-0" "ASR 052-draft-0" "ASR 052-draft-0" "ASR 052-draft-0" ...
##  $ SDG                           : chr  "BMM26" "BMM26" "BMM26" "BMM26" ...
##  $ Lab Matrix                    : chr  "" "" "" "" ...
##  $ Lab Material                  : chr  "" "" "" "" ...
##  $ Parameter Type                : chr  "PAH/aPAH" "PAH/aPAH" "PAH/aPAH" "PAH/aPAH" ...
##  $ Chemical Name                 : chr  "Acenaphthene" "Acenaphthylene" "Anthracene" "Benz(a)anthracene" ...
##  $ Chemical Code                 : chr  "83-32-9" "208-96-8" "120-12-7" "56-55-3" ...
##  $ Chemical Type                 : chr  "Target Analyte" "Target Analyte" "Target Analyte" "Target Analyte" ...
##  $ Concentration - NDs at MDL    : num  0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 ...
##  $ Concentration - NDs at zero   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Unit                          : chr  "ug/L" "ug/L" "ug/L" "ug/L" ...
##  $ Final Qualifiers              : chr  "U" "U" "U" "U" ...
##  $ Validation Qualifiers         : chr  "U" "U" "U" "U" ...
##  $ Lab Qualifiers                : chr  "U" "U" "U" "U" ...
##  $ Nondetect Flag                : chr  "Y" "Y" "Y" "Y" ...
##  $ Validation Level              : chr  "Validated Level4" "Validated Level4" "Validated Level4" "Validated Level4" ...
##  $ Reporting Limit               : num  0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 0.053 ...
##  $ Method Detection Limit        : num  0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 0.0053 ...
##  $ Measurement Basis             : chr  "Wet" "Wet" "Wet" "Wet" ...
##  $ Lab Fraction                  : chr  "Total" "Total" "Total" "Total" ...
##  $ Preparation Method            : chr  "SW3510C" "SW3510C" "SW3510C" "SW3510C" ...
##  $ Analytical Method             : chr  "SW8270C_MOD" "SW8270C_MOD" "SW8270C_MOD" "SW8270C_MOD" ...
##  $ Base Analytical Method        : chr  "8270" "8270" "8270" "8270" ...
##  $ Lab Replicate                 : chr  "" "" "" "" ...
##  $ Dilution Factor               : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ Date Extracted                : Date, format: "2010-07-20" "2010-07-20" ...
##  $ Date Analyzed                 : Date, format: "2010-07-21" "2010-07-21" ...

hadley commented 9 years ago

Can you please supply a minimal reproducible example?

e.g. this works as I expect:

read_csv("a b,c.d
1, 2")

Both the space in the column name `a b` and the period in `c.d` are preserved.