Open JoshVarty opened 5 years ago
I want to look at the label/class level lwlwrap scores. My plan:
- xresnet18 on curated dataset.
- xresnet50 on curated dataset.
- xresnet152 and will have to run experiments directly on it.
- xresnet50 on curated dataset.
- xresnet50 on noisy dataset.

The results seem very similar, which is encouraging to me. I'll continue to work with the xresnet50 model.
xresnet18 results:
[('Squeak', 0.5150189461211108),
('Fill_(with_liquid)', 0.641576923076923),
('Mechanical_fan', 0.6460921165002798),
('Walk_and_footsteps', 0.6652010328852436),
('Tap', 0.6695439383482864),
('Hiss', 0.670070523871863),
('Water_tap_and_faucet', 0.7310608065799881),
('Accelerating_and_revving_and_vroom', 0.7312871449538118),
('Sink_(filling_or_washing)', 0.7428575737987503),
('Buzz', 0.7497855465452604),
('Motorcycle', 0.7510529100529103),
('Traffic_noise_and_roadway_noise', 0.7530077808901339),
('Scissors', 0.7530841871494045),
('Bus', 0.7538304787966867),
('Cutlery_and_silverware', 0.7627543720647171),
('Bathtub_(filling_or_washing)', 0.7650796524264266),
('Yell', 0.7732956099456099),
('Clapping', 0.7794168465333072),
('Frying_(food)', 0.7821780085988995),
('Microwave_oven', 0.7996936026936029),
('Slam', 0.800984126984127),
('Run', 0.8011654641654642),
('Dishes_and_pots_and_pans', 0.8027969209343371),
('Trickle_and_dribble', 0.8039832285115305),
('Male_speech_and_man_speaking', 0.8044223119875296),
('Stream', 0.8045324590938627),
('Sneeze', 0.8050191070580279),
('Chink_and_clink', 0.8074203722466133),
('Chirp_and_tweet', 0.8081142070085512),
('Waves_and_surf', 0.808392156862745),
('Printer', 0.8097025066436832),
('Cupboard_open_or_close', 0.8180582195582198),
('Chewing_and_mastication', 0.8329148071466912),
('Drip', 0.8395234223770808),
('Electric_guitar', 0.8430702260702262),
('Knock', 0.8446432022084197),
('Cricket', 0.8450158730158731),
('Tick-tock', 0.852866038282705),
('Screaming', 0.8550793650793651),
('Writing', 0.8566741095162149),
('Gasp', 0.8599206349206349),
('Crackle', 0.8602857142857143),
('Computer_keyboard', 0.8607021095701155),
('Raindrop', 0.8623224183224183),
('Car_passing_by', 0.8647777777777778),
('Meow', 0.8651060606060607),
('Drawer_open_or_close', 0.8659999999999999),
('Sigh', 0.8744360902255639),
('Marimba_and_xylophone', 0.8766666666666667),
('Male_singing', 0.8771174242424244),
('Female_speech_and_woman_speaking', 0.8788917748917751),
('Zipper_(clothing)', 0.8803645320197044),
('Crowd', 0.8881163820366856),
('Accordion', 0.8894405043341217),
('Keys_jangling', 0.8907037037037039),
('Gong', 0.8908465608465609),
('Race_car_and_auto_racing', 0.8914115646258505),
('Gurgling', 0.893542661639436),
('Whispering', 0.8978369592398099),
('Cheering', 0.9026666666666665),
('Child_speech_and_kid_speaking', 0.905143205100652),
('Bicycle_bell', 0.9144607152914672),
('Church_bell', 0.9175613275613277),
('Toilet_flush', 0.9215454545454547),
('Shatter', 0.9219138755980861),
('Female_singing', 0.9239761904761904),
('Glockenspiel', 0.9263716356107661),
('Hi-hat', 0.9271269841269842),
('Bark', 0.9320819725883664),
('Fart', 0.9349377578312708),
('Applause', 0.9363223443223442),
('Bass_guitar', 0.9388095238095236),
('Purr', 0.9397435897435897),
('Acoustic_guitar', 0.9408366013071895),
('Bass_drum', 0.9486666666666668),
('Harmonica', 0.9494432234432234),
('Burping_and_eructation', 0.9524836601307188),
('Finger_snapping', 0.9722222222222223),
('Strum', 0.9777777777777777),
('Skateboard', 0.9777777777777779)]
xresnet50 results:
[('Squeak', 0.5796967476552911),
('Mechanical_fan', 0.6166423379208493),
('Fill_(with_liquid)', 0.6229496222731516),
('Tap', 0.6665766662187715),
('Walk_and_footsteps', 0.6819911816578484),
('Hiss', 0.6943801258927312),
('Accelerating_and_revving_and_vroom', 0.719671679197995),
('Cutlery_and_silverware', 0.7244573808258021),
('Bus', 0.7445696562914306),
('Motorcycle', 0.7603867243867246),
('Water_tap_and_faucet', 0.7635761455525605),
('Slam', 0.7636478632478633),
('Sink_(filling_or_washing)', 0.7715374637310121),
('Bathtub_(filling_or_washing)', 0.7729596908939016),
('Traffic_noise_and_roadway_noise', 0.7783834422657951),
('Chink_and_clink', 0.7795379487333949),
('Buzz', 0.7795703366255115),
('Printer', 0.7856553446553446),
('Trickle_and_dribble', 0.7874070897655804),
('Stream', 0.7886853002070394),
('Dishes_and_pots_and_pans', 0.7904616112762565),
('Scissors', 0.8006760276760277),
('Clapping', 0.8059344707580003),
('Gasp', 0.8070286195286195),
('Chirp_and_tweet', 0.8107546709056144),
('Run', 0.8116531788624813),
('Male_speech_and_man_speaking', 0.8131386890164481),
('Car_passing_by', 0.8182514029180697),
('Frying_(food)', 0.821039078784786),
('Waves_and_surf', 0.82202035002035),
('Yell', 0.8220937135682899),
('Cupboard_open_or_close', 0.8254031017715229),
('Meow', 0.8365361889572416),
('Screaming', 0.8365776315188082),
('Microwave_oven', 0.8448888888888891),
('Tick-tock', 0.8470364887098156),
('Drip', 0.8499790640394088),
('Drawer_open_or_close', 0.8570767195767197),
('Cricket', 0.8588571428571428),
('Female_speech_and_woman_speaking', 0.8590212380212381),
('Electric_guitar', 0.8607066745890275),
('Crackle', 0.8650843230843233),
('Computer_keyboard', 0.866425925925926),
('Knock', 0.8667259211376859),
('Marimba_and_xylophone', 0.8682222222222223),
('Gurgling', 0.8785294187850577),
('Writing', 0.8809215969215972),
('Gong', 0.882772967772968),
('Zipper_(clothing)', 0.882857142857143),
('Chewing_and_mastication', 0.8895165272799681),
('Sneeze', 0.8899356402032496),
('Sigh', 0.8953216374269006),
('Raindrop', 0.8985396825396825),
('Male_singing', 0.8995754066721809),
('Church_bell', 0.8997777777777778),
('Crowd', 0.8998240740740742),
('Toilet_flush', 0.9007936507936508),
('Cheering', 0.9035873015873015),
('Shatter', 0.9186984126984127),
('Female_singing', 0.9188271604938272),
('Fart', 0.9198444690537713),
('Applause', 0.9222539682539682),
('Hi-hat', 0.9230899035604917),
('Whispering', 0.9264180264180265),
('Race_car_and_auto_racing', 0.9271825396825397),
('Keys_jangling', 0.9324444444444445),
('Child_speech_and_kid_speaking', 0.9329189964157706),
('Harmonica', 0.9335910364145658),
('Glockenspiel', 0.9360442546583849),
('Bark', 0.9394685990338164),
('Bicycle_bell', 0.9412935323383085),
('Bass_guitar', 0.9416812865497076),
('Acoustic_guitar', 0.943079365079365),
('Burping_and_eructation', 0.951611111111111),
('Bass_drum', 0.9539130434782609),
('Purr', 0.96),
('Accordion', 0.9627659574468085),
('Finger_snapping', 0.9755555555555556),
('Skateboard', 0.98),
('Strum', 0.9844444444444443)]
Noisy Results:
[('Raindrop', 0.04436092818362065),
('Yell', 0.04709596802228852),
('Male_speech_and_man_speaking', 0.06077752907029983),
('Female_speech_and_woman_speaking', 0.06129906387991314),
('Tap', 0.07044646545586392),
('Run', 0.07426774336114877),
('Mechanical_fan', 0.12713501514712922),
('Bass_drum', 0.13488520985632177),
('Whispering', 0.143602781852499),
('Cupboard_open_or_close', 0.15220660985997603),
('Gurgling', 0.15424962213240476),
('Slam', 0.17783540173677712),
('Fill_(with_liquid)', 0.17970898416872713),
('Trickle_and_dribble', 0.19582481820173298),
('Waves_and_surf', 0.20398130848197554),
('Walk_and_footsteps', 0.21162738618664048),
('Printer', 0.22396371953285665),
('Drip', 0.2252067834721889),
('Electric_guitar', 0.23033393801316),
('Tick-tock', 0.23289241872973251),
('Strum', 0.233163901289192),
('Drawer_open_or_close', 0.2635682609147807),
('Microwave_oven', 0.26577928658905614),
('Meow', 0.2682685095822111),
('Sneeze', 0.27928309544437124),
('Keys_jangling', 0.2968212178435703),
('Writing', 0.30339122794798734),
('Child_speech_and_kid_speaking', 0.33160134075932535),
('Purr', 0.33201695075654847),
('Hiss', 0.3357226613948607),
('Marimba_and_xylophone', 0.3427948537227723),
('Clapping', 0.3461528850877272),
('Dishes_and_pots_and_pans', 0.34882385123915316),
('Shatter', 0.3588214570690273),
('Cutlery_and_silverware', 0.3619312163265543),
('Car_passing_by', 0.3688139246958086),
('Frying_(food)', 0.3843920685323417),
('Male_singing', 0.3863365816111013),
('Squeak', 0.40173804220050824),
('Gasp', 0.40545024420024417),
('Buzz', 0.4067923102075746),
('Stream', 0.42896697584856414),
('Cheering', 0.4321385768052455),
('Traffic_noise_and_roadway_noise', 0.4324368157632883),
('Fart', 0.4453632247712203),
('Hi-hat', 0.4485055295657091),
('Chink_and_clink', 0.4596125510045755),
('Sink_(filling_or_washing)', 0.46574733665661133),
('Accordion', 0.46696217494089826),
('Bus', 0.47356929859527025),
('Gong', 0.4747603425050182),
('Water_tap_and_faucet', 0.47566924742457084),
('Applause', 0.4757884662957798),
('Bathtub_(filling_or_washing)', 0.48014184921518915),
('Accelerating_and_revving_and_vroom', 0.48220425453948546),
('Glockenspiel', 0.4864583333333332),
('Scissors', 0.49082872682872697),
('Motorcycle', 0.4997266717070133),
('Crackle', 0.5062945641269248),
('Crowd', 0.51102090320301),
('Female_singing', 0.5204413177236709),
('Chewing_and_mastication', 0.5283894207157367),
('Zipper_(clothing)', 0.5311896621896623),
('Cricket', 0.5705661087708224),
('Toilet_flush', 0.6106988936988939),
('Skateboard', 0.6255098301480656),
('Screaming', 0.6542345876345876),
('Sigh', 0.6605130249867092),
('Knock', 0.6936239316239317),
('Computer_keyboard', 0.6946315536315538),
('Race_car_and_auto_racing', 0.7463410825017968),
('Bass_guitar', 0.7516565656565657),
('Chirp_and_tweet', 0.7577625799782861),
('Burping_and_eructation', 0.7591923076923078),
('Bark', 0.797081975257447),
('Church_bell', 0.8224146601217606),
('Acoustic_guitar', 0.8582222222222223),
('Finger_snapping', 0.8639674399674399),
('Harmonica', 0.890971667898497),
('Bicycle_bell', 0.9373687119955778)]
We haven't looked at the output distributions, so we should probably see which ones we're getting wrong. It might also be useful to look at the length of the clips we're getting right vs. wrong.