`mark_start()`/`mark_end()` sometimes break autovectorization

Adding mark_start() to the tight inner loop here:

"""
    scorep1(opp, me)

Return `me`, plus 3 when `opp == me`, plus 6 when `me == opp + 1` or
`opp == me + 2`.

Each bonus is formed by multiplying a constant with a `Bool`, and the two win
conditions are combined with the non-short-circuiting `|`, so the body contains
no `if`, `&&` or `||`.
"""
@inline function scorep1(opp, me)
    draw_bonus = 0x3 * (opp == me)
    win_bonus = 0x6 * ((opp + 0x1 == me) | (me + 0x2 == opp))
    return me + draw_bonus + win_bonus
end

"""
    scorep2(opp, target)

Derive a choice in `1:3` from `opp` and `target`, then return that choice plus
`3 * (target - 1)`.

The choice is `mod1(opp + mod1(target + 1, 3), 3)`; `mod1` keeps both
intermediate values in the range `1:3` instead of `0:2`.
"""
@inline function scorep2(opp, target)
    offset = mod1(target + 0x1, 0x3)
    choice = mod1(opp + offset, 0x3)
    bonus = 0x3 * (target - 0x1)
    return choice + bonus
end

# Convenience method: read the whole file as bytes and score it with the default `f`.
solve(file::String) = solve(read(file))

"""
    solve(data, f=scorep1)

Sum `f(opp, me)` over `data`, read as consecutive 4-byte records.

In each record, byte 0 is mapped to `opp` by subtracting `'A'` and adding 1, and
byte 2 is mapped to `me` by subtracting `'X'` and adding 1; bytes 1 and 3 are
ignored. A final record that is missing only its fourth byte is still counted.
A trailing fragment shorter than three bytes is skipped, since reading it would
index past the end of `data` under `@inbounds`.

The total is accumulated in a `UInt16` and returned as such, so it wraps around
above `typemax(UInt16)`.

`data` is indexed from 1. The `f::F ... where F` signature is there to force
specialization on the scoring function.
"""
function solve(data, f::F=scorep1) where F
    l = length(data)
    acc = UInt16(0)
    # Stop at `l - 2` so that `idx + 2` never exceeds `l`; this is what makes the
    # `@inbounds` below valid for every input length, including 0.
    @inbounds @simd for idx in 1:4:(l - 2)
        opp = data[idx + 0] - UInt8('A') + 0x1
        me  = data[idx + 2] - UInt8('X') + 0x1
        acc += f(opp, me)
    end
    return acc
end

This breaks vectorization pretty badly: the generated code goes from happily using lots of `xmm` registers to only using `eax` & friends. I just wanted to know how much performance was still left on the table, which is kind of hard to do when the tool itself breaks the vectorization. I don't yet know how to fix it, so this issue is just here for tracking this in general, but it ought to be possible to have our cake & eat it too here.

JuliaPerf / MCAnalyzer.jl

`mark_start()`/`mark_end()` sometimes break autovectorization #30