Read csv without unknow header and different number of fields each line

teocci commented 3 years ago

I have a file this this one:

2021-07-14T17:49:48.837,FE, 6, 0, 0, D,FF,BE,    42,mavlink_request_data_stream_t,req_message_rate,4,target_system,0,target_component,0,req_stream_id,3,start_stop,1,,sig ,Len,14,crc16,15714
2021-07-14T17:49:48.869,FE,1C, 0, 0,F0, 1, 1,    1E,mavlink_attitude_t,time_boot_ms,114029,roll,0.04656032,pitch,0.0197014,yaw,2.916162,rollspeed,0.0042041,pitchspeed,-0.00257009,yawspeed,-0.001730594,,sig ,Len,36,crc16,34405
2021-07-14T17:49:48.869,FE,1C, 0, 0,F1, 1, 1,    21,mavlink_global_position_int_t,time_boot_ms,114029,lat,357658398,lon,1274153234,alt,375210,relative_alt,17,vx,-27,vy,-28,vz,12,hdg,16708,,sig ,Len,36,crc16,35248
2021-07-14T17:49:48.874,FE,1F, 0, 0,F2, 1, 1,     1,mavlink_sys_status_t,onboard_control_sensors_present,325188655,onboard_control_sensors_enabled,308411439,onboard_control_sensors_health,326237231,load,160,voltage_battery,23569,current_battery,0,drop_rate_comm,0,errors_comm,0,errors_count1,0,errors_count2,0,errors_count3,0,errors_count4,0,battery_remaining,99,,sig ,Len,39,crc16,10226
2021-07-14T17:49:48.875,FE, 6, 0, 0,F3, 1, 1,    7D,mavlink_power_status_t,Vcc,5112,Vservo,2,flags,3,,sig ,Len,14,crc16,32691
2021-07-14T17:49:48.876,FE, 4, 0, 0,F4, 1, 1,    98,mavlink_meminfo_t,brkval,0,freemem,65535,freemem32,0,,sig ,Len,12,crc16,44665
2021-07-14T17:49:48.877,FE,1A, 0, 0,F5, 1, 1,    3E,mavlink_nav_controller_output_t,nav_roll,2.662952,nav_pitch,1.120603,alt_error,0,aspd_error,0,xtrack_error,0,nav_bearing,167,target_bearing,0,wp_dist,0,,sig ,Len,34,crc16,42532
2021-07-14T17:49:48.877,FE, 2, 0, 0,F6, 1, 1,    2A,mavlink_mission_current_t,seq,0,,sig ,Len,10,crc16,15872
2021-07-14T17:49:48.878,FE,14, 0, 0,F7, 1, 1,    4A,mavlink_vfr_hud_t,airspeed,0.006447488,groundspeed,0.3970219,alt,375.21,climb,-0.1277524,heading,167,throttle,0,,sig ,Len,28,crc16,63070
2021-07-14T17:49:48.878,FE,15, 0, 0,F8, 1, 1,    24,mavlink_servo_output_raw_t,time_usec,114029578,servo1_raw,982,servo2_raw,982,servo3_raw,982,servo4_raw,982,servo5_raw,982,servo6_raw,982,servo7_raw,0,servo8_raw,0,port,0,servo9_raw,0,servo10_raw,0,servo11_raw,0,servo12_raw,0,servo13_raw,0,servo14_raw,0,servo15_raw,0,servo16_raw,0,,sig ,Len,29,crc16,10946
2021-07-14T17:49:48.879,FE,2A, 0, 0,F9, 1, 1,    41,mavlink_rc_channels_t,time_boot_ms,114029,chan1_raw,1494,chan2_raw,1492,chan3_raw,982,chan4_raw,1494,chan5_raw,1299,chan6_raw,1488,chan7_raw,1493,chan8_raw,1486,chan9_raw,1494,chan10_raw,1494,chan11_raw,1494,chan12_raw,1494,chan13_raw,1494,chan14_raw,1494,chan15_raw,1494,chan16_raw,1494,chan17_raw,0,chan18_raw,0,chancount,16,rssi,0,,sig ,Len,50,crc16,42490
2021-07-14T17:49:48.879,FE,16, 0, 0,FA, 1, 1,    23,mavlink_rc_channels_raw_t,time_boot_ms,114029,chan1_raw,1494,chan2_raw,1492,chan3_raw,982,chan4_raw,1494,chan5_raw,1299,chan6_raw,1488,chan7_raw,1493,chan8_raw,1486,port,0,rssi,0,,sig ,Len,30,crc16,50926
2021-07-14T17:49:48.879,FE,1A, 0, 0,FB, 1, 1,    1B,mavlink_raw_imu_t,time_usec,114029629,xacc,48,yacc,-31,zacc,-997,xgyro,15,ygyro,-7,zgyro,-1,xmag,-342,ymag,-1,zmag,337,id,0,temperature,0,,sig ,Len,34,crc16,61209
2021-07-14T17:49:48.880,FE,16, 0, 0,FC, 1, 1,    74,mavlink_scaled_imu2_t,time_boot_ms,114029,xacc,40,yacc,-30,zacc,-997,xgyro,0,ygyro,-2,zgyro,-1,xmag,-302,ymag,-18,zmag,341,temperature,0,,sig ,Len,30,crc16,27531
2021-07-14T17:49:48.880,FE, E, 0, 0,FD, 1, 1,    1D,mavlink_scaled_pressure_t,time_boot_ms,114029,press_abs,970.946,press_diff,0,temperature,3822,temperature_press_diff,0,,sig ,Len,22,crc16,12037
2021-07-14T17:49:48.881,FE,1E, 0, 0,FE, 1, 1,    18,mavlink_gps_raw_int_t,time_usec,113820000,lat,357658394,lon,1274153298,alt,381980,eph,86,epv,132,vel,0,cog,25646,fix_type,3,satellites_visible,14,alt_ellipsoid,0,h_acc,0,v_acc,0,vel_acc,0,hdg_acc,0,yaw,0,,sig ,Len,38,crc16,56474
2021-07-14T17:49:48.882,FE,23, 0, 0,FF, 1, 1,    7C,mavlink_gps2_raw_t,time_usec,113820000,lat,357658428,lon,1274153213,alt,383180,dgps_age,0,eph,86,epv,132,vel,2,cog,31181,fix_type,3,satellites_visible,14,dgps_numch,0,yaw,0,alt_ellipsoid,0,h_acc,0,v_acc,0,vel_acc,0,hdg_acc,0,,sig ,Len,43,crc16,20400
2021-07-14T17:49:48.882,FE, C, 0, 0, 0, 1, 1,     2,mavlink_system_time_t,time_unix_usec,1626252579757269,time_boot_ms,114032,,sig ,Len,20,crc16,39761
2021-07-14T17:49:48.883,FE,1C, 0, 0, 1, 1, 1,    A3,mavlink_ahrs_t,omegaIx,-0.01148541,omegaIy,0.005339885,omegaIz,-0.0001667932,accel_weight,0,renorm_val,0,error_rp,0.003915358,error_yaw,0.003721569,,sig ,Len,36,crc16,12478
2021-07-14T17:49:48.883,FE,18, 0, 0, 2, 1, 1,    B2,mavlink_ahrs2_t,roll,0.02967097,pitch,0.02159006,yaw,2.961208,altitude,0,lat,0,lng,0,,sig ,Len,32,crc16,6036
2021-07-14T17:49:48.883,FE,28, 0, 0, 3, 1, 1,    B6,mavlink_ahrs3_t,roll,0.0465561,pitch,0.01969914,yaw,2.916158,altitude,375.21,lat,357658398,lng,1274153234,v1,0,v2,0,v3,0,v4,0,,sig ,Len,48,crc16,19848
2021-07-14T17:49:48.883,FE, 3, 0, 0, 4, 1, 1,    A5,mavlink_hwstatus_t,Vcc,5111,I2Cerr,0,,sig ,Len,11,crc16,54392
2021-07-14T17:49:48.883,FE,16, 0, 0, 5, 1, 1,    88,mavlink_terrain_report_t,lat,357658398,lon,1274153234,terrain_height,0,current_height,0,spacing,0,pending,504,loaded,112,,sig ,Len,30,crc16,63659
2021-07-14T17:49:48.883,FE, E, 0, 0, 6, 1, 1,    9E,mavlink_mount_status_t,pointing_a,0,pointing_b,-70,pointing_c,-52,target_system,0,target_component,0,,sig ,Len,22,crc16,53991
2021-07-14T17:49:48.884,FE,16, 0, 0, 7, 1, 1,    C1,mavlink_ekf_status_report_t,velocity_variance,0.08776879,pos_horiz_variance,0.02859282,pos_vert_variance,0.023573,compass_variance,0.02226898,terrain_alt_variance,0,flags,831,airspeed_variance,0,,sig ,Len,30,crc16,37206
2021-07-14T17:49:48.884,FE,1C, 0, 0, 8, 1, 1,    20,mavlink_local_position_ned_t,time_boot_ms,114035,x,-0.5938861,y,0.5027716,z,-0.01835009,vx,-0.2745549,vy,-0.2851341,vz,0.1275563,,sig ,Len,36,crc16,35136
2021-07-14T17:49:48.884,FE,20, 0, 0, 9, 1, 1,    F1,mavlink_vibration_t,time_usec,114035034,vibration_x,0.01895511,vibration_y,0.01639727,vibration_z,0.01650113,clipping_0,0,clipping_1,0,clipping_2,0,,sig ,Len,40,crc16,17895
2021-07-14T17:49:48.884,FE,24, 0, 0, A, 1, 1,    93,mavlink_battery_status_t,current_consumed,0,energy_consumed,0,temperature,32767,voltages,,current_battery,0,id,0,battery_function,0,type,0,battery_remaining,99,time_remaining,0,charge_state,0,voltages_ext,,,sig ,Len,44,crc16,27062
2021-07-14T17:49:48.885,FE,1C, 0, 0, B, 1, 1,    1E,mavlink_attitude_t,time_boot_ms,114279,roll,0.04652789,pitch,0.01981729,yaw,2.916214,rollspeed,0.004798831,pitchspeed,-0.002019421,yawspeed,-0.001487666,,sig ,Len,36,crc16,6788
2021-07-14T17:49:48.885,FE,1C, 0, 0, C, 1, 1,    21,mavlink_global_position_int_t,time_boot_ms,114279,lat,357658397,lon,1274153234,alt,375220,relative_alt,21,vx,-27,vy,-28,vz,12,hdg,16708,,sig ,Len,36,crc16,47284
2021-07-14T17:49:48.885,FE,1F, 0, 0, D, 1, 1,     1,mavlink_sys_status_t,onboard_control_sensors_present,325188655,onboard_control_sensors_enabled,308411439,onboard_control_sensors_health,326237231,load,155,voltage_battery,23572,current_battery,0,drop_rate_comm,0,errors_comm,0,errors_count1,0,errors_count2,0,errors_count3,0,errors_count4,0,battery_remaining,99,,sig ,Len,39,crc16,18819
2021-07-14T17:49:48.885,FE, 6, 0, 0, E, 1, 1,    7D,mavlink_power_status_t,Vcc,5107,Vservo,0,flags,3,,sig ,Len,14,crc16,25606
2021-07-14T17:49:48.885,FE, 4, 0, 0, F, 1, 1,    98,mavlink_meminfo_t,brkval,0,freemem,65535,freemem32,0,,sig ,Len,12,crc16,11693
2021-07-14T17:49:48.885,FE,1A, 0, 0,10, 1, 1,    3E,mavlink_nav_controller_output_t,nav_roll,2.66055,nav_pitch,1.128154,alt_error,0,aspd_error,0,xtrack_error,0,nav_bearing,167,target_bearing,0,wp_dist,0,,sig ,Len,34,crc16,11749
2021-07-14T17:49:48.885,FE, 2, 0, 0,11, 1, 1,    2A,mavlink_mission_current_t,seq,0,,sig ,Len,10,crc16,56192
2021-07-14T17:49:48.885,FE,14, 0, 0,12, 1, 1,    4A,mavlink_vfr_hud_t,airspeed,0.009835538,groundspeed,0.3958453,alt,375.22,climb,-0.126612,heading,167,throttle,0,,sig ,Len,28,crc16,52521
2021-07-14T17:49:48.885,FE,15, 0, 0,13, 1, 1,    24,mavlink_servo_output_raw_t,time_usec,114279570,servo1_raw,982,servo2_raw,982,servo3_raw,982,servo4_raw,982,servo5_raw,982,servo6_raw,982,servo7_raw,0,servo8_raw,0,port,0,servo9_raw,0,servo10_raw,0,servo11_raw,0,servo12_raw,0,servo13_raw,0,servo14_raw,0,servo15_raw,0,servo16_raw,0,,sig ,Len,29,crc16,26468
2021-07-14T17:49:48.886,FE,2A, 0, 0,14, 1, 1,    41,mavlink_rc_channels_t,time_boot_ms,114279,chan1_raw,1494,chan2_raw,1492,chan3_raw,982,chan4_raw,1494,chan5_raw,1299,chan6_raw,1488,chan7_raw,1493,chan8_raw,1486,chan9_raw,1494,chan10_raw,1494,chan11_raw,1494,chan12_raw,1494,chan13_raw,1494,chan14_raw,1494,chan15_raw,1494,chan16_raw,1494,chan17_raw,0,chan18_raw,0,chancount,16,rssi,0,,sig ,Len,50,crc16,60861
2021-07-14T17:49:48.886,FE,16, 0, 0,15, 1, 1,    23,mavlink_rc_channels_raw_t,time_boot_ms,114279,chan1_raw,1494,chan2_raw,1492,chan3_raw,982,chan4_raw,1494,chan5_raw,1299,chan6_raw,1488,chan7_raw,1493,chan8_raw,1486,port,0,rssi,0,,sig ,Len,30,crc16,14666
2021-07-14T17:49:48.886,FE,1A, 0, 0,16, 1, 1,    1B,mavlink_raw_imu_t,time_usec,114279615,xacc,48,yacc,-29,zacc,-998,xgyro,16,ygyro,-7,zgyro,-1,xmag,-341,ymag,-1,zmag,338,id,0,temperature,0,,sig ,Len,34,crc16,28862
2021-07-14T17:49:48.886,FE,16, 0, 0,17, 1, 1,    74,mavlink_scaled_imu2_t,time_boot_ms,114279,xacc,43,yacc,-29,zacc,-999,xgyro,-1,ygyro,-2,zgyro,0,xmag,-300,ymag,-23,zmag,341,temperature,0,,sig ,Len,30,crc16,65170
2021-07-14T17:49:48.886,FE, E, 0, 0,18, 1, 1,    1D,mavlink_scaled_pressure_t,time_boot_ms,114279,press_abs,970.9694,press_diff,0,temperature,3822,temperature_press_diff,0,,sig ,Len,22,crc16,37211
2021-07-14T17:49:48.886,FE,1E, 0, 0,19, 1, 1,    18,mavlink_gps_raw_int_t,time_usec,114220000,lat,357658391,lon,1274153298,alt,381980,eph,86,epv,132,vel,2,cog,25646,fix_type,3,satellites_visible,14,alt_ellipsoid,0,h_acc,0,v_acc,0,vel_acc,0,hdg_acc,0,yaw,0,,sig ,Len,38,crc16,36449
2021-07-14T17:49:48.886,FE,23, 0, 0,1A, 1, 1,    7C,mavlink_gps2_raw_t,time_usec,114220000,lat,357658426,lon,1274153210,alt,383150,dgps_age,0,eph,86,epv,132,vel,3,cog,31181,fix_type,3,satellites_visible,14,dgps_numch,0,yaw,0,alt_ellipsoid,0,h_acc,0,v_acc,0,vel_acc,0,hdg_acc,0,,sig ,Len,43,crc16,38274
2021-07-14T17:49:48.886,FE, C, 0, 0,1B, 1, 1,     2,mavlink_system_time_t,time_unix_usec,1626252580007199,time_boot_ms,114281,,sig ,Len,20,crc16,28243
2021-07-14T17:49:48.886,FE,1C, 0, 0,1C, 1, 1,    A3,mavlink_ahrs_t,omegaIx,-0.01150851,omegaIy,0.005350895,omegaIz,-0.0001637726,accel_weight,0,renorm_val,0,error_rp,0.003739818,error_yaw,0.003369438,,sig ,Len,36,crc16,53172
2021-07-14T17:49:48.886,FE,18, 0, 0,1D, 1, 1,    B2,mavlink_ahrs2_t,roll,0.02958553,pitch,0.0215758,yaw,2.961349,altitude,0,lat,0,lng,0,,sig ,Len,32,crc16,64559
2021-07-14T17:49:48.886,FE,28, 0, 0,1E, 1, 1,    B6,mavlink_ahrs3_t,roll,0.04652454,pitch,0.01981941,yaw,2.916214,altitude,375.22,lat,357658397,lng,1274153234,v1,0,v2,0,v3,0,v4,0,,sig ,Len,48,crc16,11276
2021-07-14T17:49:48.887,FE, 3, 0, 0,1F, 1, 1,    A5,mavlink_hwstatus_t,Vcc,5107,I2Cerr,0,,sig ,Len,11,crc16,23168

The structure of the file has a clear pattern for example: 2021-07-14T17:49:48.883,FE, 3, 0, 0, 4, 1, 1, A5,mavlink_hwstatus_t,Vcc,5111,I2Cerr,0,,sig ,Len,11,crc16,54392 in this line we can see we have the date we will call that GPSTime the we have seven hex digits that I don't know what they are or represent and I don't want to capture them Lets name them as Str02 until Str08. Then we have the MessageID it is a hex digit but it has 4 or 5 spaces in front. Then we have the MessageName and finally the MessageData where it should be parsed this Vcc,5111,I2Cerr,0,,sig ,Len,11,crc16,54392

This is the struct I made

type RawCSVData struct {
    GPSTime     string            `json:"gps_time" csv:"gps_time"`
    Str02       string            `json:"str_02" csv:"str_02"`
    Str03       string            `json:"str_03" csv:"str_03"`
    Str04       string            `json:"str_04" csv:"str_04"`
    Str05       string            `json:"str_05" csv:"str_05"`
    Str06       string            `json:"str_06" csv:"str_06"`
    Str07       string            `json:"str_07" csv:"str_07"`
    Str08       string            `json:"str_08" csv:"str_08"`
    MessageID   string            `json:"msg_id" csv:"msg_id"`
    MessageName string            `json:"msg_name" csv:"msg_name"`
    MessageData map[string]string `json:"-" csv:"-"`
}

I followed your example:

var csvHeader []string

func init() {
    h, err := csvutil.Header(RawCSVData{}, "csv")
    if err != nil {
        log.Fatal(err)
    }
    csvHeader = h
}

func main() {
    data := []byte(`
2021-07-14T17:49:48.883,FE, 3, 0, 0, 4, 1, 1,    A5,mavlink_hwstatus_t,Vcc,5111,I2Cerr,0,,sig ,Len,11,crc16,54392
2021-07-14T17:49:48.885,FE, 6, 0, 0, E, 1, 1,    7D,mavlink_power_status_t,Vcc,5107,Vservo,0,flags,3,,sig ,Len,14,crc16,25606`)

    r := csv.NewReader(bytes.NewReader(data))

    dec, err := csvutil.NewDecoder(r, csvHeader...)
    if err != nil {
        log.Fatal(err)
    }

    var data []RawCSVData
    for {
        var u RawCSVData

        if err := dec.Decode(&u); err == io.EOF {
            break
        } else if err != nil {
            log.Fatal(err)
        }

        for _, i := range dec.Unused() {
            fmt.Println(i)
        }

        data = append(data , u)
    }

    fmt.Printf("%+v", data )
}

When I run this code I got this error:

2021/10/20 14:09:52 wrong number of fields in record

There is any way to process this kind of csv file using this library? There are three features that I need to process this file:

Process each line based on the RawCSVData struct: this means the first 10 fields or colums
Columns that were not defined in the struct should be omitted or stored in the dec.Unused() so we can process them as we need.
A tag to omit consecutive columns probably like regex omit{7} in this case is a simple struct of 15 fields. But Imagine you have a file with 100 columns and you want to omit 90 of them. This means, you need to make an struct with 100 fields and use csv:"-" in 90 of them. I think is insane. It will be nice also to drop data that is not defined in the struct. For example if you have a file with rows like this:
```
1478,2021-08-25 13:10:07.643,POS,38829326,35.7299019,127.353324,296.3,-0.2403892,-0.3103892,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
```
I also got 2021/10/20 14:09:52 wrong number of fields in record At the end I created a new file and delete all the ,,,,,,,,,,,,, but this is not a good practice. That normalization might compromise the data.

amissu89 commented 3 years ago

oh, yes. I had a similar problem. please save me...

teocci commented 3 years ago

Did you @amissu89 find a solution?

jszwec commented 3 years ago

No, this is not a use case for csvutil. Your csv input requires to have the same number of fields in each row, otherwise mapping gets really complicated/ambiguous. If your rows were identical in size we could solve your header problem.

csvutil works on top of https://pkg.go.dev/encoding/csv, you will probably be better off using it directly. You will need to set FieldsPerRecord to -1 otherwise it will give you the same error (wrong number of fields)

teocci commented 3 years ago

I parsed line by line using regex:

const regexRawNormalizer = `(?P<header>^[0-9].*\s{4}(?P<id>[[:xdigit:]]+),mavlink_[a-z_0-9]*),(?P<data>.*),,(?P<suffix>sig\s.*)`

There is no other way to parse this

jszwec commented 3 years ago

I don't think that's true - https://play.golang.org/p/SzXidR00Q2T You can easily process exactly what you need with it

I believe I can close this issue now

jszwec / csvutil

Read csv without unknow header and different number of fields each line #44