kunzmi / managedCuda

ManagedCUDA aims to provide easy integration of NVIDIA's CUDA into .NET applications written in C#, Visual Basic or any other .NET language.

unified memory implementation #52

Closed. HouseOfFinwe closed this issue 6 years ago.

HouseOfFinwe commented 6 years ago

I'm curious as to why you chose to implement the CudaManagedMemory as individual classes instead of generics like you did with the CudaDeviceVariables. I briefly perused the code and didn't see anything that would prohibit this. Unless I'm missing something, it should clean up the code substantially. Would you mind shedding some light on this?

kunzmi commented 6 years ago

From the Wiki page: As the previous approach using generics and marshalling was not satisfying in terms of speed and direct pointer arithmetic with generics is not possible in C#, I tried something new, what I would call "templates with C#" using T4: A T4 template creates all possible variants like 'float', 'int4', etc. which then access memory directly via pointers. The achieved performance of this approach is close to native arrays. In case you want to use CudaPagelockedHostMemory with your own datatypes, simply copy the tt-file to your project and modify the list of types to process (but be aware of the license: managedCUDA is LGPL!).

In other words: using generics forces you to go through the Marshal class to access the raw data, which is extremely slow. Using unsafe pointer arithmetic gives you access to each array element with practically no speed penalty compared to standard arrays in C#.
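(Illustrative sketch only, not managedCuda code: the two access paths being compared. The generic path has to round-trip every element through Marshal, while the per-type path generated by the T4 template is a plain pointer dereference.)

```csharp
using System;
using System.Runtime.InteropServices;

static class ElementAccessSketch
{
    // Generic path: works for any T : struct, but every read goes through
    // Marshal.PtrToStructure, which boxes and copies the element.
    public static T ReadViaMarshal<T>(IntPtr basePtr, int index) where T : struct
    {
        int elemSize = Marshal.SizeOf(typeof(T));
        return (T)Marshal.PtrToStructure(basePtr + index * elemSize, typeof(T));
    }

    // Per-type path (what the T4-generated classes do): a direct pointer
    // dereference, essentially as fast as indexing a normal C# array.
    public static unsafe float ReadViaPointer(float* basePtr, int index)
    {
        return basePtr[index];
    }
}
```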

HouseOfFinwe commented 6 years ago

Thank you for your prompt response.

HouseOfFinwe commented 6 years ago

I see what you did with the auto-generated code. I still think it would be convenient to be able to pass a generic class as an argument.

What about this approach:

```csharp
/// <summary>
/// A variable located in managed memory.<para/>
/// Type: generic
/// </summary>
public abstract class CudaManagedMemory<T> : IDisposable, IEnumerable<T> where T : struct
{
    protected CUdeviceptr _devPtr;
    SizeT _size = 0;
    SizeT _typeSize = 0;
    CUResult res;
    bool disposed;
    bool _isOwner;

    #region Constructor
    /// <summary>
    /// Creates a new CudaManagedMemory and allocates the memory on host/device.
    /// </summary>
    /// <param name="size">In elements</param>
    /// <param name="attachFlags"></param>
    protected CudaManagedMemory(SizeT size, CUmemAttach_flags attachFlags)
    {
        _devPtr = new CUdeviceptr();
        _size = size;
        _typeSize = (SizeT)Marshal.SizeOf(typeof(T));

        res = DriverAPINativeMethods.MemoryManagement.cuMemAllocManaged(ref _devPtr, _typeSize * size, attachFlags);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemAllocManaged", res));
        if (res != CUResult.Success) throw new CudaException(res);
        _isOwner = true;
    }

    /// <summary>
    /// Creates a new CudaManagedMemory from definition in cu-file.
    /// </summary>
    /// <param name="module">The module where the variable is defined in.</param>
    /// <param name="name">The variable name as defined in the cu-file.</param>
    protected CudaManagedMemory(CUmodule module, string name)
    {
        _devPtr = new CUdeviceptr();
        SizeT _sizeInBytes = new SizeT();
        res = DriverAPINativeMethods.ModuleManagement.cuModuleGetGlobal_v2(ref _devPtr, ref _sizeInBytes, module, name);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}. Name: {3}, Size (in bytes): {4}", DateTime.Now, "cuModuleGetGlobal_v2", res, name, _sizeInBytes.ToString()));
        if (res != CUResult.Success) throw new CudaException(res);

        _typeSize = (SizeT)Marshal.SizeOf(typeof(T));
        _size = _sizeInBytes / _typeSize;

        if (_sizeInBytes != _size * _typeSize)
            throw new CudaException("Variable size is not a multiple of its type size.");

        _isOwner = false;
    }

    /// <summary>
    /// Creates a new CudaManagedMemory from definition in cu-file.
    /// </summary>
    /// <param name="kernel">The kernel which module defines the variable.</param>
    /// <param name="name">The variable name as defined in the cu-file.</param>
    protected CudaManagedMemory(CudaKernel kernel, string name)
        : this(kernel.CUModule, name)
    {

    }

    /// <summary>
    /// For dispose
    /// </summary>
    ~CudaManagedMemory()
    {
        Dispose(false);
    }
    #endregion

    #region Dispose
    /// <summary>
    /// Dispose
    /// </summary>
    public void Dispose()
    {
        Dispose(true);
        GC.SuppressFinalize(this);
    }

    /// <summary>
    /// For IDisposable
    /// </summary>
    /// <param name="fDisposing"></param>
    protected virtual void Dispose(bool fDisposing)
    {
        if (fDisposing && !disposed)
        {
            if (_isOwner)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFree_v2(_devPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFree_v2", res));
            }
            disposed = true;
        }
        if (!fDisposing && !disposed)
            Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
    }
    #endregion

    #region Properties
    /// <summary>
    /// UIntPtr to managed memory.
    /// </summary>
    public UIntPtr HostPointer
    {
        get { return _devPtr.Pointer; }
    }

    /// <summary>
    /// CUdeviceptr to managed memory.
    /// </summary>
    public CUdeviceptr DevicePointer
    {
        get { return _devPtr; }
    }

    /// <summary>
    /// Size in bytes
    /// </summary>
    public SizeT SizeInBytes
    {
        get { return _size * _typeSize; }
    }

    /// <summary>
    /// Size in elements
    /// </summary>
    public SizeT Size
    {
        get { return _size; }
    }

    /// <summary>
    /// Access array per element.
    /// </summary>
    /// <param name="index">index in elements</param>
    /// <returns></returns>
    public virtual T this[SizeT index]
    {
        get
        {
            // Overridden in the concrete per-type subclasses with direct pointer access.
            return default(T);
        }
        set
        {
            // Overridden in the concrete per-type subclasses to write via direct pointer access.
        }
    }

    /// <summary>
    /// If the wrapper class instance is the owner of a CUDA handle, it will be destroyed while disposing.
    /// </summary>
    public bool IsOwner
    {
        get { return _isOwner; }
    }
    #endregion

    #region Converter operators
    /// <summary>
    /// Converts a managed variable to a host value. In case of multiple managed values (array), only the first value is converted.
    /// </summary>
    /// <param name="d">managed variable</param>
    /// <returns>newly allocated host variable with value from managed memory</returns>
    public static implicit operator T(CudaManagedMemory<T> d)
    {
        return d[0];
    }
    #endregion

    #region GetAttributeMethods
    /// <summary>
    /// The <see cref="CUcontext"/> on which a pointer was allocated or registered
    /// </summary>
    public CUcontext AttributeContext
    {
        get
        {
            CUcontext ret = new CUcontext();
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.Context, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// The <see cref="CUMemoryType"/> describing the physical location of a pointer 
    /// </summary>
    public CUMemoryType AttributeMemoryType
    {
        get
        {
            CUMemoryType ret = new CUMemoryType();
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.MemoryType, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// The address at which a pointer's memory may be accessed on the device <para/>
    /// Except in the exceptional disjoint addressing cases, the value returned will equal the input value.
    /// </summary>
    public CUdeviceptr AttributeDevicePointer
    {
        get
        {
            CUdeviceptr ret = new CUdeviceptr();
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.DevicePointer, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// The address at which a pointer's memory may be accessed on the host 
    /// </summary>
    public IntPtr AttributeHostPointer
    {
        get
        {
            IntPtr ret = new IntPtr();
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.HostPointer, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// A pair of tokens for use with the nv-p2p.h Linux kernel interface
    /// </summary>
    public CudaPointerAttributeP2PTokens AttributeP2PTokens
    {
        get
        {
            CudaPointerAttributeP2PTokens ret = new CudaPointerAttributeP2PTokens();
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.P2PTokens, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// Synchronize every synchronous memory operation initiated on this region
    /// </summary>
    public bool AttributeSyncMemops
    {
        get
        {
            int ret = 0;
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.SyncMemops, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret != 0;
        }
        set
        {
            int val = value ? 1 : 0;
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerSetAttribute(ref val, CUPointerAttribute.SyncMemops, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerSetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
    }

    /// <summary>
    /// A process-wide unique ID for an allocated memory region
    /// </summary>
    public ulong AttributeBufferID
    {
        get
        {
            ulong ret = 0;
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.BufferID, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret;
        }
    }

    /// <summary>
    /// Indicates if the pointer points to managed memory
    /// </summary>
    public bool AttributeIsManaged
    {
        get
        {
            int ret = 0;
            CUResult res = DriverAPINativeMethods.MemoryManagement.cuPointerGetAttribute(ref ret, CUPointerAttribute.IsManaged, _devPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuPointerGetAttribute", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ret != 0;
        }
    }
    #endregion

    #region Methods
    /// <summary>
    /// Attach memory to a stream asynchronously
    /// <para/>
    /// Enqueues an operation in <c>hStream</c> to specify stream association of
    /// <c>length</c> bytes of memory starting from <c>dptr</c>. This function is a
    /// stream-ordered operation, meaning that it is dependent on, and will
    /// only take effect when, previous work in stream has completed. Any
    /// previous association is automatically replaced.
    /// <para/>
    /// <c>dptr</c> must point to an address within managed memory space declared
    /// using the __managed__ keyword or allocated with cuMemAllocManaged.
    /// <para/>
    /// <c>length</c> must be zero, to indicate that the entire allocation's
    /// stream association is being changed. Currently, it's not possible
    /// to change stream association for a portion of an allocation.
    /// <para/>
    /// The stream association is specified using <c>flags</c> which must be
    /// one of <see cref="CUmemAttach_flags"/>.
    /// If the <see cref="CUmemAttach_flags.Global"/> flag is specified, the memory can be accessed
    /// by any stream on any device.
    /// If the <see cref="CUmemAttach_flags.Host"/> flag is specified, the program makes a guarantee
    /// that it won't access the memory on the device from any stream.
    /// If the <see cref="CUmemAttach_flags.Single"/> flag is specified, the program makes a guarantee
    /// that it will only access the memory on the device from <c>hStream</c>. It is illegal
    /// to attach singly to the NULL stream, because the NULL stream is a virtual global
    /// stream and not a specific stream. An error will be returned in this case.
    /// <para/>
    /// When memory is associated with a single stream, the Unified Memory system will
    /// allow CPU access to this memory region so long as all operations in <c>hStream</c>
    /// have completed, regardless of whether other streams are active. In effect,
    /// this constrains exclusive ownership of the managed memory region by
    /// an active GPU to per-stream activity instead of whole-GPU activity.
    /// <para/>
    /// Accessing memory on the device from streams that are not associated with
    /// it will produce undefined results. No error checking is performed by the
    /// Unified Memory system to ensure that kernels launched into other streams
    /// do not access this region. 
    /// <para/>
    /// It is a program's responsibility to order calls to <see cref="DriverAPINativeMethods.Streams.cuStreamAttachMemAsync"/>
    /// via events, synchronization or other means to ensure legal access to memory
    /// at all times. Data visibility and coherency will be changed appropriately
    /// for all kernels which follow a stream-association change.
    /// <para/>
    /// If <c>hStream</c> is destroyed while data is associated with it, the association is
    /// removed and the association reverts to the default visibility of the allocation
    /// as specified at cuMemAllocManaged. For __managed__ variables, the default
    /// association is always <see cref="CUmemAttach_flags.Global"/>. Note that destroying a stream is an
    /// asynchronous operation, and as a result, the change to default association won't
    /// happen until all work in the stream has completed.
    /// <para/>
    /// </summary>
    /// <param name="hStream">Stream in which to enqueue the attach operation</param>
    /// <param name="length">Length of memory (must be zero)</param>
    /// <param name="flags">Must be one of <see cref="CUmemAttach_flags"/></param>
    /// <returns></returns>
    public void StreamAttachMemAsync(CUstream hStream, SizeT length, CUmemAttach_flags flags)
    {
        if (disposed) throw new ObjectDisposedException(this.ToString());
        res = DriverAPINativeMethods.Streams.cuStreamAttachMemAsync(hStream, _devPtr, length, flags);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuStreamAttachMemAsync", res));
        if (res != CUResult.Success) throw new CudaException(res);
    }

    /// <summary>
    /// Prefetches memory to the specified destination device<para/>
    /// Prefetches memory to the specified destination device. devPtr is the 
    /// base device pointer of the memory to be prefetched and dstDevice is the 
    /// destination device. count specifies the number of bytes to copy. hStream
    /// is the stream in which the operation is enqueued.<para/>
    /// 
    /// Passing in CU_DEVICE_CPU for dstDevice will prefetch the data to CPU memory.<para/>
    /// 
    /// If no physical memory has been allocated for this region, then this memory region
    /// will be populated and mapped on the destination device. If there's insufficient
    /// memory to prefetch the desired region, the Unified Memory driver may evict pages
    /// belonging to other memory regions to make room. If there's no memory that can be
    /// evicted, then the Unified Memory driver will prefetch less than what was requested.<para/>
    /// 
    /// In the normal case, any mappings to the previous location of the migrated pages are
    /// removed and mappings for the new location are only setup on the dstDevice.
    /// The application can exercise finer control on these mappings using ::cudaMemAdvise.
    /// </summary>
    /// <param name="dstDevice">Destination device to prefetch to</param>
    /// <param name="hStream">Stream to enqueue prefetch operation</param>
    /// <remarks>Note that this function is asynchronous with respect to the host and all work on other devices.</remarks>
    public void PrefetchAsync(CUdevice dstDevice, CUstream hStream)
    {
        if (disposed) throw new ObjectDisposedException(this.ToString());
        res = DriverAPINativeMethods.MemoryManagement.cuMemPrefetchAsync(_devPtr, _size * _typeSize, dstDevice, hStream);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemPrefetchAsync", res));
        if (res != CUResult.Success) throw new CudaException(res);
    }

    #endregion

    #region IEnumerable
    IEnumerator<T> IEnumerable<T>.GetEnumerator()
    {
        if (disposed) throw new ObjectDisposedException(this.ToString());
        IEnumerator<T> enumerator = new CudaManagedMemoryEnumerator<T>(this);
        return enumerator;
    }

    IEnumerator IEnumerable.GetEnumerator()
    {
        if (disposed) throw new ObjectDisposedException(this.ToString());
        IEnumerator enumerator = new CudaManagedMemoryEnumerator<T>(this);
        return enumerator;
    }

    #endregion

    #region static methods (new in Cuda 8.0)
    /// <summary>
    /// Advise about the usage of a given memory range<para/>
    /// Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.<para/>
    /// <para/>
    /// The \p advice parameter can take the following values:<para/>
    /// - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
    /// from and only occasionally written to. This allows the driver to create read-only
    /// copies of the data in a processor's memory when that processor accesses it. Similarly,
    /// if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
    /// the data on the destination processor. When a processor writes to this data, all copies
    /// of the corresponding page are invalidated except for the one where the write occurred.
    /// The \p device argument is ignored for this advice.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
    /// duplicated copies of the data will be freed no later than the next write access to that data.<para/>
    /// - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
    /// data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
    /// preferred location as CPU memory. Setting the preferred location does not cause data to
    /// migrate to that location immediately. Instead, it guides the migration policy when a fault
    /// occurs on that memory region. If the data is already in its preferred location and the
    /// faulting processor can establish a mapping without requiring the data to be migrated, then
    /// the migration will be avoided. On the other hand, if the data is not in its preferred location
    /// or if a direct mapping cannot be established, then it will be migrated to the processor accessing
    /// it. It is important to note that setting the preferred location does not prevent data prefetching
    /// done using ::cuMemPrefetchAsync.<para/>
    /// Having a preferred location can override the thrash detection and resolution logic in the Unified
    /// Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
    /// memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
    /// if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
    /// When the Unified Memory driver has to evict pages from a certain location on account of that
    /// memory being oversubscribed, the preferred location will be used to decide the destination to which
    /// a page should be evicted to.<para/>
    /// If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
    /// location will be ignored for that subset.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
    /// and changes the preferred location to none.<para/>
    /// - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
    /// This does not cause data migration and has no impact on the location of the data per se. Instead,
    /// it causes the data to always be mapped in the specified processor's page tables, as long as the
    /// location of the data permits a mapping to be established. If the data gets migrated for any reason,
    /// the mappings are updated accordingly.<para/>
    /// This advice is useful in scenarios where data locality is not important, but avoiding faults is.
    /// Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
    /// data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
    /// over to the other GPUs is not as important because the accesses are infrequent and the overhead of
    /// migration may be too high. But preventing faults can still help improve performance, and so having
    /// a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
    /// to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
    /// ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
    /// page in CPU memory.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
    /// mappings may be removed at any time causing accesses to result in page faults.
    /// <para/>
    /// Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
    /// <para/>
    /// Note that this function is asynchronous with respect to the host and all work
    /// on other devices.
    /// </summary>
    /// <param name="devPtr">Pointer to memory to set the advice for</param>
    /// <param name="count">Size in bytes of the memory range</param>
    /// <param name="advice">Advice to be applied for the specified memory range</param>
    /// <param name="device">Device to apply the advice for</param>
    public static void MemAdvise(CUdeviceptr devPtr, SizeT count, CUmemAdvise advice, CUdevice device)
    {
        CUResult res = DriverAPINativeMethods.MemoryManagement.cuMemAdvise(devPtr, count, advice, device);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemAdvise", res));
        if (res != CUResult.Success) throw new CudaException(res);
    }

    /// <summary>
    /// Advise about the usage of a given memory range<para/>
    /// Advise the Unified Memory subsystem about the usage pattern for the memory range starting at devPtr with a size of count bytes.<para/>
    /// <para/>
    /// The \p advice parameter can take the following values:<para/>
    /// - ::CU_MEM_ADVISE_SET_READ_MOSTLY: This implies that the data is mostly going to be read
    /// from and only occasionally written to. This allows the driver to create read-only
    /// copies of the data in a processor's memory when that processor accesses it. Similarly,
    /// if cuMemPrefetchAsync is called on this region, it will create a read-only copy of
    /// the data on the destination processor. When a processor writes to this data, all copies
    /// of the corresponding page are invalidated except for the one where the write occurred.
    /// The \p device argument is ignored for this advice.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_READ_MOSTLY: Undoes the effect of ::CU_MEM_ADVISE_SET_READ_MOSTLY. Any read
    /// duplicated copies of the data will be freed no later than the next write access to that data.<para/>
    /// - ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION: This advice sets the preferred location for the
    /// data to be the memory belonging to \p device. Passing in CU_DEVICE_CPU for \p device sets the
    /// preferred location as CPU memory. Setting the preferred location does not cause data to
    /// migrate to that location immediately. Instead, it guides the migration policy when a fault
    /// occurs on that memory region. If the data is already in its preferred location and the
    /// faulting processor can establish a mapping without requiring the data to be migrated, then
    /// the migration will be avoided. On the other hand, if the data is not in its preferred location
    /// or if a direct mapping cannot be established, then it will be migrated to the processor accessing
    /// it. It is important to note that setting the preferred location does not prevent data prefetching
    /// done using ::cuMemPrefetchAsync.<para/>
    /// Having a preferred location can override the thrash detection and resolution logic in the Unified
    /// Memory driver. Normally, if a page is detected to be constantly thrashing between CPU and GPU
    /// memory say, the page will eventually be pinned to CPU memory by the Unified Memory driver. But
    /// if the preferred location is set as GPU memory, then the page will continue to thrash indefinitely.
    /// When the Unified Memory driver has to evict pages from a certain location on account of that
    /// memory being oversubscribed, the preferred location will be used to decide the destination to which
    /// a page should be evicted to.<para/>
    /// If ::CU_MEM_ADVISE_SET_READ_MOSTLY is also set on this memory region or any subset of it, the preferred
    /// location will be ignored for that subset.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_PREFERRED_LOCATION: Undoes the effect of ::CU_MEM_ADVISE_SET_PREFERRED_LOCATION
    /// and changes the preferred location to none.<para/>
    /// - ::CU_MEM_ADVISE_SET_ACCESSED_BY: This advice implies that the data will be accessed by \p device.
    /// This does not cause data migration and has no impact on the location of the data per se. Instead,
    /// it causes the data to always be mapped in the specified processor's page tables, as long as the
    /// location of the data permits a mapping to be established. If the data gets migrated for any reason,
    /// the mappings are updated accordingly.<para/>
    /// This advice is useful in scenarios where data locality is not important, but avoiding faults is.
    /// Consider for example a system containing multiple GPUs with peer-to-peer access enabled, where the
    /// data located on one GPU is occasionally accessed by other GPUs. In such scenarios, migrating data
    /// over to the other GPUs is not as important because the accesses are infrequent and the overhead of
    /// migration may be too high. But preventing faults can still help improve performance, and so having
    /// a mapping set up in advance is useful. Note that on CPU access of this data, the data may be migrated
    /// to CPU memory because the CPU typically cannot access GPU memory directly. Any GPU that had the
    /// ::CU_MEM_ADVISE_SET_ACCESSED_BY flag set for this data will now have its mapping updated to point to the
    /// page in CPU memory.<para/>
    /// - ::CU_MEM_ADVISE_UNSET_ACCESSED_BY: Undoes the effect of CU_MEM_ADVISE_SET_ACCESSED_BY. The current set of
    /// mappings may be removed at any time causing accesses to result in page faults.
    /// <para/>
    /// Passing in ::CU_DEVICE_CPU for \p device will set the advice for the CPU.
    /// <para/>
    /// Note that this function is asynchronous with respect to the host and all work
    /// on other devices.
    /// </summary>
    /// <param name="ptr">managed memory variable</param>
    /// <param name="advice">Advice to be applied for the specified memory range</param>
    /// <param name="device">Device to apply the advice for</param>
    public static void MemAdvise(CudaManagedMemory<T> ptr, CUmemAdvise advice, CUdevice device)
    {
        CUResult res = DriverAPINativeMethods.MemoryManagement.cuMemAdvise(ptr.DevicePointer, ptr.SizeInBytes, advice, device);
        Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemAdvise", res));
        if (res != CUResult.Success) throw new CudaException(res);
    }
    #endregion
}

/// <summary>
/// Enumerator class for <see cref="CudaManagedMemory{T}"/>
/// </summary>
public class CudaManagedMemoryEnumerator<T> : IEnumerator<T> where T : struct
{
    private CudaManagedMemory<T> _memory = null;
    private SizeT _currentIndex = -1;

    /// <summary>
    /// 
    /// </summary>
    /// <param name="memory"></param>
    public CudaManagedMemoryEnumerator(CudaManagedMemory<T> memory)
    {
        _memory = memory;
    }

    void IDisposable.Dispose() { }

    /// <summary>
    /// 
    /// </summary>
    public void Reset()
    {
        _currentIndex = -1;
    }

    /// <summary>
    /// 
    /// </summary>
    public T Current
    {
        get { return _memory[_currentIndex]; }
    }

    /// <summary>
    /// 
    /// </summary>
    object IEnumerator.Current
    {
        get { return _memory[_currentIndex]; }
    }

    /// <summary>
    /// 
    /// </summary>
    /// <returns></returns>
    public bool MoveNext()
    {
        _currentIndex += 1;
        if ((long)_currentIndex >= (long)_memory.Size)
            return false;
        else
            return true;
    }

}

/// <summary>
/// A variable located in managed memory.<para/>
/// Type: byte
/// </summary>
public unsafe class CudaManagedMemory_byte : CudaManagedMemory<byte>
{
    byte* _ptr;

    /// <summary>
    /// Creates a new CudaManagedMemory and allocates the memory on host/device.
    /// </summary>
    /// <param name="size">In elements</param>
    /// <param name="attachFlags"></param>
    public CudaManagedMemory_byte(SizeT size, CUmemAttach_flags attachFlags)
        : base(size, attachFlags)
    {
        _ptr = (byte*)(UIntPtr)_devPtr.Pointer;
    }

    /// <summary>
    /// Creates a new CudaManagedMemory from definition in cu-file.
    /// </summary>
    /// <param name="module">The module where the variable is defined in.</param>
    /// <param name="name">The variable name as defined in the cu-file.</param>
    public CudaManagedMemory_byte(CUmodule module, string name)
        : base(module, name)
    {
        _ptr = (byte*)(UIntPtr)_devPtr.Pointer;
    }

    /// <summary>
    /// Creates a new CudaManagedMemory from definition in cu-file.
    /// </summary>
    /// <param name="kernel">The kernel which module defines the variable.</param>
    /// <param name="name">The variable name as defined in the cu-file.</param>
    public CudaManagedMemory_byte(CudaKernel kernel, string name)
        : this(kernel.CUModule, name)
    {

    }

    /// <summary>
    /// Access array per element.
    /// </summary>
    /// <param name="index">index in elements</param>
    /// <returns></returns>
    public override byte this[SizeT index]
    {
        get
        {
            return _ptr[index];
        }
        set
        {
            _ptr[index] = value;
        }
    }
}

```
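For context, typical usage of the proposed class might look like this (a hedged sketch only; it assumes a default-initialized CUdevice refers to device 0 and uses the default stream, and the kernel launch is just indicated in a comment):

```csharp
// Hypothetical usage sketch of the proposed classes above: allocate unified
// memory, write it from the host through the indexer, then prefetch it
// towards the GPU before a kernel launch.
using (var ctx = new CudaContext(0))                   // creates and binds a CUDA context on device 0
{
    var data = new CudaManagedMemory_byte(1024, CUmemAttach_flags.Global);

    for (int i = 0; i < 1024; i++)
        data[i] = (byte)(i % 256);                     // direct host-side element access

    CUdevice dev = new CUdevice();                     // assumption: default struct == device 0
    data.PrefetchAsync(dev, new CUstream());           // hint the driver to migrate pages; default stream

    // kernel.Run(data.DevicePointer, 1024);           // the same pointer is usable on the device

    data.Dispose();                                    // frees the cuMemAllocManaged allocation
}
```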

And you would just repeat the small amount of code, as illustrated by CudaManagedMemory_byte, for each concrete version of the abstract generic class.
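For concreteness, a second concrete type under this proposal might look like the following (a sketch mirroring CudaManagedMemory_byte above; only the element type and the pointer cast change):

```csharp
/// <summary>
/// A variable located in managed memory.<para/>
/// Type: float
/// </summary>
public unsafe class CudaManagedMemory_float : CudaManagedMemory<float>
{
    float* _ptr;

    public CudaManagedMemory_float(SizeT size, CUmemAttach_flags attachFlags)
        : base(size, attachFlags)
    {
        _ptr = (float*)(UIntPtr)_devPtr.Pointer;
    }

    public CudaManagedMemory_float(CUmodule module, string name)
        : base(module, name)
    {
        _ptr = (float*)(UIntPtr)_devPtr.Pointer;
    }

    public CudaManagedMemory_float(CudaKernel kernel, string name)
        : this(kernel.CUModule, name)
    {
    }

    /// <summary>Access array per element via direct pointer arithmetic.</summary>
    public override float this[SizeT index]
    {
        get { return _ptr[index]; }
        set { _ptr[index] = value; }
    }
}
```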

It preserves the ability to access memory directly while allowing the convenience of generics.

What do you think?

HouseOfFinwe commented 6 years ago

Have you had a chance to review what I'm proposing?

The template concept you implemented is slightly less code than my idea, since you don't need to write a concrete implementation for each datatype. However, each concrete implementation is only 10-12 lines of code, and it allows the use of generic parameters/arguments, which substantially reduces the amount of code users have to write since they no longer need separate methods and/or classes for each datatype they work with.
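As an illustration of that convenience (a hypothetical helper, not existing managedCuda API), a single generic method could accept any CudaManagedMemory&lt;T&gt; rather than needing one overload per concrete type:

```csharp
// Hypothetical helper, assuming the generic base class proposed above.
// One method covers every element type; with purely per-type classes,
// a separate overload would be needed for each CudaManagedMemory_xxx.
static T[] CopyToHostArray<T>(CudaManagedMemory<T> mem) where T : struct
{
    int count = (int)mem.Size;      // assumes the element count fits in an int
    var result = new T[count];
    for (int i = 0; i < count; i++)
        result[i] = mem[i];         // indexer overridden by the concrete subclass
    return result;
}
```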