apache / mxnet

Lightweight, Portable, Flexible Distributed/Mobile Deep Learning with Dynamic, Mutation-aware Dataflow Dep Scheduler; for Python, R, Julia, Scala, Go, Javascript and more
https://mxnet.apache.org
Apache License 2.0

Error when trying to resize videos to pixels for image-recognition analysis in R #14061

Open jasperDD opened 5 years ago

jasperDD commented 5 years ago

I want to create a predictive model based on image processing. I have many video files (.mov) on Google Drive (related to autonomous driving), and I get the data from Google Drive as links to the files. My drive and its data are available to all Internet users.

test

https://drive.google.com/drive/folders/1JidqB3TfHn0Cky8VBXHjbmHu7s0rGLrO?usp=sharing

train

https://drive.google.com/drive/folders/1WIFQIC23_o1__BPmlRDpnYYwmthH2AP-?usp=sharing

library("googledrive")

X = googledrive::drive_ls(path = "test")
Label = googledrive::drive_ls(path = "train")
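
(Note: drive_ls() only lists file metadata — name, id, drive_resource — it does not download the videos themselves. A minimal sketch of fetching the listed files locally with googledrive, assuming the X and Label listings above; the local folder names and the download_drive_files helper are illustrative assumptions:)

library(googledrive)

# Hypothetical helper: download every file listed in a dribble to a local folder
download_drive_files <- function(listing, dest_dir) {
  dir.create(dest_dir, recursive = TRUE, showWarnings = FALSE)
  for (i in seq_len(nrow(listing))) {
    googledrive::drive_download(
      file = googledrive::as_id(listing$id[i]),
      path = file.path(dest_dir, listing$name[i]),
      overwrite = TRUE
    )
  }
}

download_drive_files(X, "test_videos")      # X lists the "test" folder
download_drive_files(Label, "train_videos") # Label lists the "train" folder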

Here is an example of the structure of the data frames from Google Drive (if needed), as produced by dput():

train=structure(list(name = structure(c(1L, 11L, 10L, 9L, 8L, 7L, 6L, 
5L, 4L, 3L, 2L), .Label = c("<chr>", "047a188c-1ac1965c.mov", 
"047a7ecb-68221e4a.mov", "047c278b-452d36f8.mov", "047e715f-3e47a9aa.mov", 
"047e715f-81e81a28.mov", "047e732b-aa79a87d.mov", "0571873b-de675e01.mov", 
"0571873b-faf718b2.mov", "0573e933-a8b4cf7d.mov", "0573f031-8ef23cf6.mov"
), class = "factor"), id = structure(c(1L, 5L, 7L, 11L, 9L, 3L, 
10L, 8L, 4L, 2L, 6L), .Label = c("<chr>", "115rWp3h3Of3Rx61mqDRfhatFMFOpImRf", 
"1EfokXp8UAxYKlpmGAIwU3FRJTTqrgDrS", "1EJa-A0a4_4nVgeF-pBXh6q6DFToGTYFu", 
"1HHML9bo4UPY9r1hIL9igSX_t5FXH5n82", "1HzVTOqRwNfxVDey6EYmDe2nd8hnnTbHT", 
"1IhMQiiCyb_WcKif8qyQmeK1W0tb8iU-U", "1lQc1a0mFw158T9U_QRvgoF0a33xiehZc", 
"1StqEC_7hJO4HJ9uvC0o7sjLLY3tdceNp", "1thEsWrcYFN4qgG57RCUxqCr7WE6ecrmq", 
"1xcxAuHamoFKHCD05wHfdVjeVDEN-FW8C"), class = "factor"), drive_resource = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("<list [41]>", 
"<list>"), class = "factor")), .Names = c("name", "id", "drive_resource"
), class = "data.frame", row.names = c(NA, -11L))

test=structure(list(name = structure(c(1L, 2L, 4L, 3L, 11L, 10L, 9L, 
8L, 7L, 6L, 5L), .Label = c("<chr>", "ddbd3eb2-ed0fde54.mov", 
"ddbf7bbb-c1908e76.mov", "ddd140cc-c54a4e82.mov", "df94066c-654dcae9.mov", 
"df94066c-b5e96c4c.mov", "df952550-4cb35087.mov", "df9b2e94-c14fc6a7.mov", 
"df9b8801-a11fba46.mov", "df9cc07f-5cec2c16.mov", "dfa06e5c-aa220d9a.mov"
), class = "factor"), id = structure(c(1L, 10L, 2L, 3L, 8L, 6L, 
11L, 7L, 9L, 4L, 5L), .Label = c("<chr>", "18fDVBhfyAHqUffG0GNFGti7549G43bhZ", 
"1aYVn6L7147dDPcOb5CKC3RHh28fS7qix", "1Evm3EotD1xRoljlVCZ3sDIMnEmKaTbO5", 
"1jhbfo3NSKKjbLrMkEh-HRx-UIOUr6R5o", "1kK5AvfwTV_exoWO55dEwEH4QIHaqpVER", 
"1mjr8xSRdULPbmkQN-7L5Dx9yMb_zLxWh", "1OSg6d4q9is80c9Oark6ktdXwvZI8IpER", 
"1Q3UlVeZXDF2cjglqxToapX2FMgRABhA9", "1uIS-Y3N_ipDuzG8kVT5gP3VScAvS-B9_", 
"1yXKCCfgMJVbLqEyTJJCjS_pKQLMnZ6Kp"), class = "factor"), drive_resource = structure(c(2L, 
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("<list [41]>", 
"<list>"), class = "factor")), .Names = c("name", "id", "drive_resource"
), class = "data.frame", row.names = c(NA, -11L))

Now I need to resize the video data to pixels:

require(EBImage)

# Dataframe of resized images
rs_df <- data.frame()

# Main loop: for each image, resize and set it to greyscale
for (i in 1:nrow(X)) {
  # Try-catch
  result <- tryCatch({
    # Image (as 1d vector)
    img <- as.numeric(X[i, ])
    # Reshape as a 64x64 image (EBImage object)
    img <- Image(img, dim = c(64, 64), colormode = "Grayscale")
    # Resize image to 28x28 pixels
    img_resized <- resize(img, w = 28, h = 28)
    # Get image matrix (there should be another function to do this faster and more neatly!)
    img_matrix <- img_resized@.Data
    # Coerce to a vector
    img_vector <- as.vector(t(img_matrix))
    # Add label
    label <- labels[i, ]
    vec <- c(label, img_vector)
    # Stack in rs_df using rbind
    rs_df <- rbind(rs_df, vec)
    # Print status
    print(paste("Done", i, sep = " "))
  },
  # Error function (just prints the error). Btw you should get no errors!
  error = function(e) { print(e) })
}

After that I get a list of errors like:

<simpleError in labels[i, ]: object of type 'closure' is not subsettable>

Then:

names(rs_df) <- c("label", paste("pixel", c(1:776))) #776 video .mov files

After this I get the error:

Error in names(rs_df) <- c("label", paste("pixel", c(1:776))) : object 'rs_df' not found

From here on, I keep getting the same kind of errors related to rs_df:

set.seed(100)

Shuffled df

shuffled <- rs_df[sample(1:400), ]
Error: object 'rs_df' not found

Train-test split

train_28 <- shuffled[1:360, ]
Error: object 'shuffled' not found

test_28 <- shuffled[361:400, ]
Error: object 'shuffled' not found

Save train-test datasets

write.csv(train_28, "train_28.csv", row.names = FALSE)
Error in is.data.frame(x) : object 'train_28' not found

write.csv(test_28, "test_28.csv", row.names = FALSE)
Error in is.data.frame(x) : object 'test_28' not found

I think the problem is that the data is loaded incorrectly from the drive, but I could be wrong; maybe the problem is something else.

How do I properly resize the videos to pixels so that I can continue this analysis? When I run the rest of the script, I get many errors like:

Error in t(test[, -1]) : object 'test' not found

Build model

Clean workspace

rm(list=ls())

Load MXNet

library("downloader") library("influenceR") library("rgexf") require(mxnet)

Load train and test datasets

train <- read.csv("train_28.csv")
test <- read.csv("test_28.csv")

Set up train and test datasets

train <- data.matrix(train)
train_x <- t(train[, -1])
train_y <- train[, 1]
train_array <- train_x
dim(train_array) <- c(28, 28, 1, ncol(train_x))

test_x <- t(test[, -1])
test_y <- test[, 1]
test_array <- test_x
dim(test_array) <- c(28, 28, 1, ncol(test_x))

Set up the symbolic model

data <- mx.symbol.Variable('data')

1st convolutional layer

conv_1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 20)
tanh_1 <- mx.symbol.Activation(data = conv_1, act_type = "tanh")
pool_1 <- mx.symbol.Pooling(data = tanh_1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))

2nd convolutional layer

conv_2 <- mx.symbol.Convolution(data = pool_1, kernel = c(5, 5), num_filter = 50)
tanh_2 <- mx.symbol.Activation(data = conv_2, act_type = "tanh")
pool_2 <- mx.symbol.Pooling(data = tanh_2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2))

1st fully connected layer

flatten <- mx.symbol.Flatten(data = pool_2)
fc_1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 500)
tanh_3 <- mx.symbol.Activation(data = fc_1, act_type = "tanh")

2nd fully connected layer

fc_2 <- mx.symbol.FullyConnected(data = tanh_3, num_hidden = 40)

Output. Softmax output since we'd like to get some probabilities.

NN_model <- mx.symbol.SoftmaxOutput(data = fc_2)

Pre-training set up

-------------------------------------------------------------------------------

Set seed for reproducibility

mx.set.seed(100)

Device used. CPU in my case.

devices <- mx.cpu()

Training

-------------------------------------------------------------------------------

Train the model

model <- mx.model.FeedForward.create(NN_model, X = train_array, y = train_y,
                                     ctx = devices, num.round = 480,
                                     array.batch.size = 40, learning.rate = 0.01,
                                     momentum = 0.9, eval.metric = mx.metric.accuracy,
                                     epoch.end.callback = mx.callback.log.train.metric(100))

So how do I correctly get the data from Google Drive and resize it to pixels to build my model? Any help is appreciated.

andrewfayres commented 5 years ago

Thanks for the issue. Adding labels to get better visibility.

@mxnet-label-bot add [R]

jeremiedb commented 5 years ago

Your issue appears more related to converting a video into an array than to mxnet per se. I'd recommend taking a look at the imager package vignette describing how to handle video formats, here: https://dahtah.github.io/imager/imager.html#loading-and-saving-videos. This should get you 4-dimensional arrays: [Width, Height, Time, Colors].

It may be worth considering working with images extracted from the video with 2D convolutions rather than directly with 3D convolutions.

jasperDD commented 5 years ago

@jeremiedb, thanks, good suggestion.

jasperDD commented 5 years ago

@jeremiedb, may I ask: where in that link does it show how to convert a video into a data frame like this?

   label pixel.1 pixel.2 pixel.3 pixel.4 pixel.5 pixel.6 pixel.7
1    304     304     304     304     304     304     304     304
2     32      32      32      32      32      32      32      32
3    350     351     351     351     351     351     351     351
4    265     265     265     265     265     265     265     265
5    108     108     108     108     108     108     108     108
6     87      87      87      87      87      87      87      87
7    191     192     192     192     192     192     192     192
8    170     170     170     170     170     170     170     170
9    329     329     329     329     329     329     329     329
10   268     268     268     268     268     268     268     268
11   238     238     238     238     238     238     238     238
12   159     159     159     159     159     159     159     159
13   220     221     221     221     221     221     221     221

jasperDD commented 5 years ago

Because that part only covers an image array (the parrot example).

jeremiedb commented 5 years ago

@jasperDD

Data should be handled as arrays in mxnet. For images, each observation is 3D (HxWxC). C refers to the color channels. To store multiple observations, a 4th dimension is therefore needed. Data fed to the network will be of shape [HxWxCxBatchSize].

For images, you could use the following approach to convert videos into arrays of images in the appropriate format:

library(imager)

# Sample video shipped with imager
fname <- system.file('extdata/tennis_sif.mpeg', package = 'imager')
# Extract 10 frames, sampled at 4 frames per second
tennis <- load.video(fname, frames = 10, fps = 4)
dim(tennis)
# Split along the time (z) axis into a list of single frames
tennis_split <- imsplit(tennis, axis = "z")

# Stack the frames into a [Width, Height, Colors, BatchSize] array for mxnet
img_array <- array(dim = c(352, 240, 3, 10))
for (i in 1:10) {
  img_array[,,,i] <- array(tennis_split[[i]], dim = c(352, 240, 3))
}

# Sanity check: convert the first frame back to a cimg and plot it
img_1 <- as.cimg(img_array[,,,1])
dim(img_1)
plot(img_1)

What the above does is extract 10 frames from the video at a sampling rate of 4 images per second. imsplit is then used to create a list of images, and the loop builds an img_array in a format compatible with mxnet. Note that this works for tests but will likely be inefficient for training datasets of decent size. Converting the frames from video into JPEG files, and then converting that collection of JPEG files into a RecordIO file through the im2rec utility, would provide a highly efficient image iterator for training on large image datasets.
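
For illustration, a minimal sketch of the frame-export step (the folder layout, file names, and the export_frames helper below are assumptions for this example, not part of mxnet or imager):

library(imager)

# Hypothetical helper: dump sampled frames of one video as JPEGs into a
# per-class folder, so im2rec can later pack the folders into a RecordIO file.
export_frames <- function(video_path, out_dir, frames = 10, fps = 4) {
  dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  vid <- load.video(video_path, frames = frames, fps = fps)
  frame_list <- imsplit(vid, axis = "z")
  stem <- tools::file_path_sans_ext(basename(video_path))
  for (i in seq_along(frame_list)) {
    out_file <- file.path(out_dir, sprintf("%s_frame_%03d.jpg", stem, i))
    imager::save.image(frame_list[[i]], out_file)
  }
}

# e.g. one folder per label/class:
# export_frames("train_videos/047a188c-1ac1965c.mov", "frames/classA")

The im2rec utility ships with the mxnet source (in its tools directory); given a list file mapping image paths to labels, it produces the .rec file that mxnet's image record iterators consume for training.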