JuliaIO / MAT.jl

Julia module for reading MATLAB files
MIT License
278 stars 71 forks source link

should return 1×n/n×1 matlab vector to julia vector, and 1×1 as scaler? #79

Open babaq opened 6 years ago

babaq commented 6 years ago

i think it's more natural for Julia user to anticipate vector instead of 1×n matrix, when loading matlab row/column vectors, and have a scalar value instead of 1×1 matrix.

maybe the MAT package should have options to do these convert?

danielolsen commented 4 years ago

I agree, the squeeze_me flag in scipy.io.loadmat is a great example of enabling this functionality when desired. https://docs.scipy.org/doc/scipy/reference/generated/scipy.io.loadmat.html

babaq commented 4 years ago

I have a function that could partially do it, but it maybe need more testing and improvement. here it is:

"Convert `MATLAB` type to `Julia` type with proper dimention"

function mat2julia!(x;isscaler = true,isvector = true)

    if x isa Dict

        for k in keys(x)

            x[k]=mat2julia!(x[k],isscaler=isscaler,isvector=isvector)

        end

    elseif x isa Array

        if ndims(x)==2

            nr,nc = size(x)

            if nr==1 && nc==1

                x = isscaler ? x[1,1] : isvector ? dropdims(x,dims=2) : x

            elseif nr==1 && nc>1

                x = isvector ? dropdims(x,dims=1) : x

            elseif nr>1 && nc==1

                x = isvector ? dropdims(x,dims=2) : x

            end

        end

        if x isa Array{Any} || x isa Array{Array} || x isa Array{Dict}

            for i in 1:length(x)

                x[i]=mat2julia!(x[i],isscaler=isscaler,isvector=isvector)

            end

        elseif x isa Dict

            for k in keys(x)

                x[k]=mat2julia!(x[k],isscaler=isscaler,isvector=isvector)

            end

        end

    end

    return x

end
briochemc commented 4 years ago

I'm not sure I understand here but it seems to me that it is better to keep the original dimensions. For example, a row vector (1×n array) has different behavior to a column vector (e.g., it multiplies naturally n×n array on the left), so it should be kept as a row vector in Julia, too, right?

Anyway if you need to drop singleton dimensions, you can use dropdims (which @babaq is already using in his snippet), e.g., with

dropdims(x, dims=Tuple(findall(size(x) .== 1)...,))

As for the function suggested, I would use dispatch on the types, and do something more like

f(x::Dict) = [x[k] = f(x[k]) for k in keys(x)]
f(x) = x # fallback default
f(x::Vector) = x
f(x::Array) = dropdims(x, dims=Tuple(findall(size(x) .== 1)...,))

to make it shorter and more explicit what it's doing.

(Sorry if I misunderstood all of this thouggh, which is likely.)

babaq commented 4 years ago

I think @danielolsen and I were proposing that it would be convenient to optionally reduce 1xn or nx1 to vector and 1x1 to scaler if users want to or know what they are doing(aware of vector/matrix multiplication, etc ).

I like the multi-dispatch approach, and maybe need two specific keywords for reduce rowvector 1xn, and columnvector nx1

@timholy is this a favorable feature to add?

babaq commented 4 years ago

here is the revised functions to reduce 1xn or nx1 to vector and 1x1 to scaler.

"""
Drop matrix dims according to `MATLAB` convention.

- scaler: scaler instead of 1x1 matrix
- rvector: vector instead of 1xN matrix
- cvector: vector instead of Nx1 matrix
"""
dropmatdim!(x;scaler = true, rvector = true, cvector = true) = x
function dropmatdim!(x::Dict;scaler = true, rvector = true, cvector = true)
    foreach(k->x[k]=dropmatdim!(x[k],scaler=scaler,rvector=rvector,cvector=cvector),keys(x))
    return x
end
function dropmatdim!(x::Array;scaler = true, rvector = true, cvector = true)
    if ndims(x) == 2
        nr,nc = size(x)
        if nr==1 && nc==1 && scaler
            return dropmatdim!(x[1,1],scaler=scaler,rvector=rvector,cvector=cvector)
        elseif nr==1 && nc>1 && rvector
            x = dropdims(x,dims=1)
        elseif nr>1 && nc==1 && cvector
            x = dropdims(x,dims=2)
        end
    end
    if eltype(x) in [Any,Array,Dict]
        foreach(i->x[i]=dropmatdim!(x[i],scaler=scaler,rvector=rvector,cvector=cvector),eachindex(x))
    end
    return x
end
briochemc commented 4 years ago

I'm still not sure I understand the conditional tree you are building in your function. I would also not use the exclamation mark since you cannot squeeze in place. Anyway, here is the tests and functions (just 3 lines!) that I would use (you should write up a series of tests like that so that it is clear what you want):

# function to squeeze
f(x) = x # fallback default
f(x::Array) = all(size(x) .== 1) ? f(x[1]) : dropdims(f.(x), dims=Tuple(findall(size(x) .== 1)))
f(x::Dict) = Dict((k,f(v)) for (k,v) in pairs(x))

# Variables for testing
x0 = "foo"
x1 = rand(3)
x2 = rand(3,1)
x3 = rand(1,3)
x4 = rand(1,1)
x5 = rand(3,1,3,1,3)
x6 = [x0, x1, x2, x3, x4, x5]
x = Dict(:x0=>x0, :x1=>x1, :x2=>x2, :x3=>x3, :x4=>x4, :x5=>x5) # Dict of arrays and strings
y = reshape([x, x0, x1, x6], (1,4))                            # Array of dicts and arrays and strings
z = Dict(:y=>y, :x=>x, :x6=>x6)                                # Dict of dicts and arrays and strings

# Some tests to check that all the arrays are squeezed recursively
["$k: size conversion: $(size(v)) → $(size(f(v))))" for (k,v) in x if v isa Array]
f(z)[:y]         # is a vector
f(z)[:y][1][:x5] # is a (3,3,3) array

which I think is what you are looking for.

PS: I think you mean "scalar" instead of "scaler"

babaq commented 4 years ago

@briochemc Thanks for checking the code and yes, I mean "scalar" instead of "scaler".

The reason I add exclamation mark is that I meant to "in place" drop dims of the data read by the MAT. Because the data i need to handle are quite large(~1GB), so i do in place process for Dict, Array{Any}, etc..

I originally just want to deal with "everything is a matrix" in MATLAB, so I only process for matrix that wrap a scalar, or row/column vector. if a 1x1x1 array is found, i wouldn't consider it meant to represent a scalar, same for a 3x3x1x7. i think it's debatable what type of squeeze is appropriate.

as for specifically deal with Array of Any, Array and Dict, one reason is for performance, the other is these are the data container types I can think of MATLAB use to save data, which probably need more thought.

I tried the test you provided, if data are small, performance is quite comparable, but for large data, it has significant difference.


using BenchmarkTools

# Variables for testing
x0 = "foo"
x1 = rand(3000)
x2 = rand(3000,1)
x3 = rand(1,3000)
x4 = rand(1,1)
x5 = rand(3000,1,30,1,3)
x6 = [x0, x1, x2, x3, x4, x5]
x = Dict(:x0=>x0, :x1=>x1, :x2=>x2, :x3=>x3, :x4=>x4, :x5=>x5) # Dict of arrays and strings
y = reshape([x, x0, x1, x6], (1,4))                            # Array of dicts and arrays and strings
z = Dict(:y=>y, :x=>x, :x6=>x6)                                # Dict of dicts and arrays and strings

@btime f($(z))
# 3.393 ms (350 allocations: 8.55 MiB)
@btime dropmatdim!($(z))
# 16.299 μs (165 allocations: 5.80 KiB)
briochemc commented 4 years ago

Oh I did not realize this was performance critical. Please try with either of f2 or f3 below then, which deal with Dicts "in place" and don't do extra work when there are no singleton dimensions. (Difference between f2 and f3 is just inline and a bit obfuscated vs a clean multi-line if-then-else statement that is longer but easier to read I guess)

# function to squeeze
f2(x) = x # fallback default
function f2(x::Array)
    if all(size(x) .== 1)
        f2(x[1])
    elseif any(size(x) .== 1)
        dropdims(f2.(x), dims=Tuple(findall()))
    else
        x
    end
end
function f2(x::Dict)
    for (k,v) in pairs(x)
        x[k] = f2(v)
    end
    return x
end

f3(x) = x # fallback default
f3(x::Array) = all(size(x) .== 1) ? f3(x[1]) : any(size(x) .== 1) ? dropdims(f3.(x), dims=Tuple(findall())) : x
f3(x::Dict) = (for (k,v) in pairs(x); x[k] = f3(v) end; x)

This is what I get:

julia> @btime f($(z)) ;
  724.776 μs (216 allocations: 8.55 MiB)

julia> @btime f2($(z)) ;
  810.080 ns (1 allocation: 16 bytes)

julia> @btime f3($(z)) ;
  807.966 ns (1 allocation: 16 bytes)

julia> @btime dropmatdim!($(z)) ;
  16.507 μs (165 allocations: 5.80 KiB)
babaq commented 4 years ago

the f2 and f3 does not handle Array of [Any,Dict,Array] when no dims dropped. see the result of z[:x7]

# Variables for testing
x0 = "foo"
x1 = rand(3000)
x2 = rand(3000,1)
x3 = rand(1,3000)
x4 = rand(1,1)
x5 = rand(3000,1,30,1,3)
x6 = [x0, x1, x2, x3, x4, x5]
x7 = fill(x4,200,20,30)
x = Dict(:x0=>x0, :x1=>x1, :x2=>x2, :x3=>x3, :x4=>x4, :x5=>x5) # Dict of arrays and strings
y = reshape([x, x0, x1, x6], (1,4))                            # Array of dicts and arrays and strings
z = Dict(:y=>y, :x=>x, :x6=>x6, :x7=>x7) 
briochemc commented 4 years ago

I'm not sure I understand, although maybe there was a typo and I should apply f2 (and f3) to the case where no dims are dropped. E.g., with

f3(x) = x # fallback default
f3(x::Array) = all(size(x) .== 1) ? f3(x[1]) : any(size(x) .== 1) ? dropdims(f3.(x), dims=Tuple(findall(size(x) .== 1))) : f3.(x)
f3(x::Dict) = (for (k,v) in pairs(x); x[k] = f3(v) end; x)

But could you show the Julia code that does not work? Because I see that for your function, too:

julia> typeof(dropmatdim!(x7))
Array{Array{Float64,2},3}

EDIT: Oh wait I see that

julia> typeof(dropmatdim!(z)[:x7])
Array{Float64,3}

so something else is going on


EDIT 2: Oh I see that I should also have made the recursive call to f3 be in place, so try

f3(x) = x # fallback default
f3(x::Array) = all(size(x) .== 1) ? f3(x[1]) : any(size(x) .== 1) ? dropdims(f3.(x), dims=Tuple(findall(size(x) .== 1))) : x .= f3.(x)
f3(x::Dict) = (for (k,v) in pairs(x); x[k] = f3(v) end; x)

which gives the same output on z[:x7] but is faster

julia> typeof(z[:x7])
Array{Float64,3}

julia> typeof(f3(z)[:x7])
Array{Float64,3}

julia> @btime f3($(z)) ;
  2.259 μs (2 allocations: 32 bytes)

julia> @btime dropmatdim!($(z)) ;
  16.365 μs (172 allocations: 6.06 KiB)

EDIT 3: Arf no I messed up somewhere... I keep modifying things in place now so my tests are wrong! I'll try again...

briochemc commented 4 years ago

Ok so I don't think dropmatdims! does the right thing here either. Protecting z from being changed by always copying it into a ztest before applying the functions that might change them, I see that:

julia> ztest = copy(z) ;

julia> typeof(dropmatdim!(ztest)[:x7])
Array{Array{Float64,2},3}
babaq commented 4 years ago

yes, it's a bug, here is the new version.

"""
Drop matrix dims according to `MATLAB` convention.

- scalar: scalar instead of 1x1 matrix
- rvector: vector instead of 1xN matrix
- cvector: vector instead of Nx1 matrix
"""
dropmatdim!(x;scalar = true, rvector = true, cvector = true) = x
dropmatdim!(x::Dict;scalar = true, rvector = true, cvector = true) = Dict(k=>dropmatdim!(x[k],scalar=scalar,rvector=rvector,cvector=cvector) for k in keys(x))
function dropmatdim!(x::Array;scalar = true, rvector = true, cvector = true)
    if ndims(x) == 2
        nr,nc = size(x)
        if nr==1 && nc==1 && scalar
            return dropmatdim!(x[1,1],scalar=scalar,rvector=rvector,cvector=cvector)
        elseif nr==1 && nc>1 && rvector
            x = dropdims(x,dims=1)
        elseif nr>1 && nc==1 && cvector
            x = dropdims(x,dims=2)
        end
    end
    if x isa Array{Any}
        foreach(i->x[i]=dropmatdim!(x[i],scalar=scalar,rvector=rvector,cvector=cvector),eachindex(x))
    elseif any((<:).(eltype(x),[Array,Dict]))
        x = dropmatdim!.(x,scalar=scalar,rvector=rvector,cvector=cvector)
    end
    return x
end

I think the Array{Any} and Array of Array/Dict should be treated differently, since reduce to scalar introduce type instability, which can not guarantied to be done in place(x8, x9 below).

# Variables for testing
x0 = "foo"
x1 = rand(3000)
x2 = rand(3000,1)
x3 = rand(1,3000)
x4 = rand(1,1)
x5 = rand(3000,1,30,1,3)
x6 = [x0, x1, x2, x3, x4, x5]
x7 = fill(x4,200,20,30)
x8 = Dict(:a=>x4,:b=>x4)
x9 = Any[x4,x4]
x = Dict(:x0=>x0, :x1=>x1, :x2=>x2, :x3=>x3, :x4=>x4, :x5=>x5) # Dict of arrays and strings
y = reshape([x, x0, x1, x6], (1,4))                            # Array of dicts and arrays and strings
z = Dict(:y=>y, :x=>x, :x6=>x6, :x7=>x7, :x8=>x8, :x9=>x9)                                # Dict of dicts and arrays and strings

from what I know, MATLAB actually can't save data that result in a Julia Array{Array}, it can only be Array{Any} from MATLAB cell array. I am not sure about Array{Dict}, but the MATLAB struct array seems be read into a Julia Dict by MAT.

babaq commented 4 years ago

I tried to save some data(cell array of size 1x1, homological struct where all value is of double, heterological struct where values can be array or string, struct array of same field names) from MATLAB in v7.3 format, and it seems confirmed my guess that there are no Array{Array} and Array{Dict} from MAT, and all Dict is of type Dict{String, Any}, so the functions could be:

"""
Drop matrix dims according to `MATLAB` convention.

- scalar: scalar instead of 1x1 matrix
- rvector: vector instead of 1xN matrix
- cvector: vector instead of Nx1 matrix
"""
dropmatdim!(x;scalar = true, rvector = true, cvector = true) = x
function dropmatdim!(x::Dict;scalar = true, rvector = true, cvector = true)
    foreach(k->x[k]=dropmatdim!(x[k],scalar=scalar,rvector=rvector,cvector=cvector),keys(x))
    return x
 end
function dropmatdim!(x::Array;scalar = true, rvector = true, cvector = true)
    if ndims(x) == 2
        nr,nc = size(x)
        if nr==1 && nc==1 && scalar
            return dropmatdim!(x[1,1],scalar=scalar,rvector=rvector,cvector=cvector)
        elseif nr==1 && nc>1 && rvector
            x = dropdims(x,dims=1)
        elseif nr>1 && nc==1 && cvector
            x = dropdims(x,dims=2)
        end
    end
    if x isa Array{Any}
        foreach(i->x[i]=dropmatdim!(x[i],scalar=scalar,rvector=rvector,cvector=cvector),eachindex(x))
    end
    return x
end