How to lower the vector dialect to the GPU or SPIR-V dialect

I have IR in the linalg dialect that I then super-vectorized into the vector dialect. How can I convert it to the GPU or SPIR-V dialect? I want to end up with a module/kernel in which the arithmetic is vectorized.
The pass pipeline and the source I was using:

mlir-opt --pass-pipeline='
builtin.module(
  convert-tensor-to-linalg,arith-bufferize,
  func.func(empty-tensor-to-alloc-tensor,scf-bufferize,shape-bufferize,linalg-bufferize,bufferization-bufferize,tensor-bufferize),
  func-bufferize,
  func.func(convert-linalg-to-affine-loops,
            affine-super-vectorize{virtual-vector-size=8},
            affine-loop-unroll{unroll-full})
)' 2.mlir > 3.mlir
2.mlir
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
module attributes {torch.debug_module_name = "ReLU"} {
  func.func @forward(%arg0: tensor<512x640x20x16xf32>) -> tensor<512x640x20x16xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = tensor.empty() : tensor<512x640x20x16xf32>
    %1 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg0 : tensor<512x640x20x16xf32>) outs(%0 : tensor<512x640x20x16xf32>) {
    ^bb0(%in: f32, %out: f32):
      %2 = arith.cmpf ugt, %in, %cst : f32
      %3 = arith.select %2, %in, %cst : f32
      linalg.yield %3 : f32
    } -> tensor<512x640x20x16xf32>
    return %1 : tensor<512x640x20x16xf32>
  }

  func.func @main() {
    %0 = arith.constant dense<1.3> : tensor<512x640x20x16xf32>
    %1 = func.call @forward(%0) : (tensor<512x640x20x16xf32>) -> tensor<512x640x20x16xf32>
    return
  }
}
3.mlir
module attributes {torch.debug_module_name = "ReLU"} {
  memref.global "private" constant @__constant_512x640x20x16xf32 : memref<512x640x20x16xf32> = dense<1.300000e+00>
  func.func @forward(%arg0: memref<512x640x20x16xf32>) -> memref<512x640x20x16xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %alloc = memref.alloc() {alignment = 64 : i64} : memref<512x640x20x16xf32>
    affine.for %arg1 = 0 to 512 {
      affine.for %arg2 = 0 to 640 {
        affine.for %arg3 = 0 to 20 {
          affine.for %arg4 = 0 to 16 step 8 {
            %cst_0 = arith.constant dense<0.000000e+00> : vector<8xf32>
            %cst_1 = arith.constant 0.000000e+00 : f32
            %0 = vector.transfer_read %arg0[%arg1, %arg2, %arg3, %arg4], %cst_1 {in_bounds = [true]} : memref<512x640x20x16xf32>, vector<8xf32>
            %1 = arith.cmpf ugt, %0, %cst_0 : vector<8xf32>
            %2 = arith.select %1, %0, %cst_0 : vector<8xi1>, vector<8xf32>
            vector.transfer_write %2, %alloc[%arg1, %arg2, %arg3, %arg4] {in_bounds = [true]} : vector<8xf32>, memref<512x640x20x16xf32>
          }
        }
      }
    }
    return %alloc : memref<512x640x20x16xf32>
  }
  func.func @main() {
    %0 = memref.get_global @__constant_512x640x20x16xf32 : memref<512x640x20x16xf32>
    %1 = call @forward(%0) : (memref<512x640x20x16xf32>) -> memref<512x640x20x16xf32>
    return
  }
}

When I apply convert-vector-to-gpu or convert-vector-to-spirv to 3.mlir above, neither pass has any effect. I guess the key point is to lower vector.transfer_read and vector.transfer_write, but I don't know what they should be lowered to or how to lower them.
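
For reference, these are roughly the invocations I tried (reconstructed, so the exact command lines may have differed slightly); both leave 3.mlir unchanged:

mlir-opt --convert-vector-to-gpu 3.mlir > 4.mlir
mlir-opt --convert-vector-to-spirv 3.mlir > 4.mlir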

I hope to finally get a module like the one below, except that the operations inside the kernel would need to be vectorized:

// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
module attributes {gpu.container_module, torch.debug_module_name = "ReLU"} {
  memref.global "private" constant @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32> = dense<1.300000e+00>
  func.func @forward(%arg0: memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32> {
    %c20 = arith.constant 20 : index
    %c640 = arith.constant 640 : index
    %c512 = arith.constant 512 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c15 = arith.constant 15 : index
    %c1 = arith.constant 1 : index
    %c0 = arith.constant 0 : index
    %memref = gpu.alloc  host_shared () : memref<512x640x20x15xf32>
    memref.copy %arg0, %memref : memref<512x640x20x15xf32> to memref<512x640x20x15xf32>
    %memref_0 = gpu.alloc  host_shared () : memref<512x640x20x15xf32>
    gpu.launch_func  @forward_kernel::@forward_kernel blocks in (%c512, %c640, %c20) threads in (%c1, %c1, %c1)  args(%memref : memref<512x640x20x15xf32>, %cst : f32, %memref_0 : memref<512x640x20x15xf32>, %c0 : index, %c15 : index, %c1 : index)
    %alloc = memref.alloc() : memref<512x640x20x15xf32>
    memref.copy %memref_0, %alloc : memref<512x640x20x15xf32> to memref<512x640x20x15xf32>
    gpu.dealloc  %memref_0 : memref<512x640x20x15xf32>
    gpu.dealloc  %memref : memref<512x640x20x15xf32>
    return %alloc : memref<512x640x20x15xf32>
  }
  gpu.module @forward_kernel attributes {spirv.target_env = #spirv.target_env<#spirv.vce<v1.0, [Addresses, Float16Buffer, Int64, Int16, Int8, Bfloat16ConversionINTEL, Kernel, Linkage, Vector16, GenericPointer, Groups, Float16, Float64, AtomicFloat32AddEXT, ExpectAssumeKHR], [SPV_INTEL_bfloat16_conversion, SPV_EXT_shader_atomic_float_add, SPV_KHR_expect_assume]>, api=OpenCL, #spirv.resource_limits<>>} {
    gpu.func @forward_kernel(%arg0: memref<512x640x20x15xf32>, %arg1: f32, %arg2: memref<512x640x20x15xf32>, %arg3: index, %arg4: index, %arg5: index) kernel attributes {gpu.known_block_size = array<i32: 1, 1, 1>, gpu.known_grid_size = array<i32: 512, 640, 20>, spirv.entry_point_abi = #spirv.entry_point_abi<>} {
      %0 = gpu.block_id  x
      %1 = gpu.block_id  y
      %2 = gpu.block_id  z
      scf.for %arg6 = %arg3 to %arg4 step %arg5 {
        %3 = memref.load %arg0[%0, %1, %2, %arg6] : memref<512x640x20x15xf32>
        %4 = arith.cmpf ugt, %3, %arg1 : f32
        %5 = arith.select %4, %3, %arg1 : f32
        memref.store %5, %arg2[%0, %1, %2, %arg6] : memref<512x640x20x15xf32>
      }
      gpu.return
    }
  }
  func.func @main() {
    %0 = memref.get_global @__constant_512x640x20x15xf32 : memref<512x640x20x15xf32>
    %1 = call @forward(%0) : (memref<512x640x20x15xf32>) -> memref<512x640x20x15xf32>
    return
  }
}