You may also try the following snippet with this repository:
def test_objaverse(self):
    import tqdm
    import torch.nn.functional as F
    us = []
    feats = []
    self.model.eval()
    if self.config.training.use_text_proj:
        self.text_proj.eval()
    clip_text_feat = torch.from_numpy(self.objaverse_lvis_loader.dataset.clip_cat_feat).to(self.config.device)
    if self.config.training.use_text_proj:
        clip_text_feat = self.text_proj(clip_text_feat)
    with torch.no_grad():
        for data in tqdm.tqdm(self.objaverse_lvis_loader):
            if not self.config.model.get("use_dense", False):
                pred_feat = self.model(data['xyz'], data['features'],
                                       device=self.config.device,
                                       quantization_size=self.config.model.voxel_size)
            else:
                pred_feat = self.model(data['xyz_dense'], data['features_dense'])
            us.extend(data['name'])
            # quantize is a helper defined elsewhere in this repository
            feats.append(torch.tensor(quantize(pred_feat.cpu().numpy())))
    torch.save({"us": us, "feats": torch.concat(feats)}, "/root/objaverse.pt")
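To read the saved embeddings back, a minimal sketch (assuming only the "us"/"feats" keys written by torch.save above):

import torch

saved = torch.load("/root/objaverse.pt", map_location="cpu")
us, feats = saved["us"], saved["feats"]            # list of uids, stacked feature tensor
uid_to_idx = {uid: i for i, uid in enumerate(us)}  # uid -> row index into feats
emb = feats[uid_to_idx[us[0]]].float()             # look up any object's embedding by uid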
@eliphatfs Thanks for your reply.
I have tried to extract the embeddings of objects from the Objaverse dataset with the model "openshape-pointbert-vitg14-rgb", but the embeddings differ from those in "openshape-objaverse-embeddings" for the same uid.
Is this due to the operation that samples the 10000 points?
How different are they? We have farthest point sampling in the tokenization layer that involves randomness, and minor differences are expected.
Hi @eliphatfs,
For the file 0155b81b90fd4585b388f5c0e34bbdfb.glb, the cosine similarity is about 0.95 between the embedding extracted with the model "openshape-pointbert-vitg14-rgb" and the one from "objaverse_embeddings" with the same uid.
Does that mean they are different, or did I do something wrong?
The following code is adapted from the repository "openshape-demo-support":
import numpy
import torch
import trimesh
import openshape

f32 = numpy.float32

def trimesh_to_pc(scene_or_mesh):
    if isinstance(scene_or_mesh, trimesh.Scene):
        meshes = []
        for node_name in scene_or_mesh.graph.nodes_geometry:
            # which geometry does this node refer to
            transform, geometry_name = scene_or_mesh.graph[node_name]
            # get the actual potential mesh instance
            geometry = scene_or_mesh.geometry[geometry_name].copy()
            if not hasattr(geometry, 'triangles'):
                continue
            geometry: trimesh.Trimesh
            geometry = geometry.apply_transform(transform)
            meshes.append(geometry)
        total_area = sum(geometry.area for geometry in meshes)
        if total_area < 1e-6:
            raise ValueError("Bad geometry: total area too small (< 1e-6)")
        pcs = []
        for geometry in meshes:
            pcs.append(model_to_pc(geometry, max(1, round(geometry.area / total_area * 10000))))
        if not len(pcs):
            raise ValueError("Unsupported mesh object: no triangles found")
        return numpy.concatenate(pcs)
    else:
        assert isinstance(scene_or_mesh, trimesh.Trimesh)
        return model_to_pc(scene_or_mesh, 10000)
def model_to_pc(mesh: trimesh.Trimesh, n_sample_points=10000):
    f32 = numpy.float32
    rad = numpy.sqrt(mesh.area / (3 * n_sample_points))
    for _ in range(24):
        pcd, face_idx = trimesh.sample.sample_surface_even(mesh, n_sample_points, rad)
        rad *= 0.85
        if len(pcd) == n_sample_points:
            break
    else:
        raise ValueError("Bad geometry, cannot finish sampling.", mesh.area)
    if isinstance(mesh.visual, trimesh.visual.ColorVisuals):
        rgba = mesh.visual.face_colors[face_idx]
    elif isinstance(mesh.visual, trimesh.visual.TextureVisuals):
        bc = trimesh.proximity.points_to_barycentric(mesh.triangles[face_idx], pcd)
        if mesh.visual.uv is None or len(mesh.visual.uv) < mesh.faces[face_idx].max():
            uv = numpy.zeros([len(bc), 2])
            print("Invalid UV, filling with zeroes")
        else:
            uv = numpy.einsum('ntc,nt->nc', mesh.visual.uv[mesh.faces[face_idx]], bc)
        material = mesh.visual.material
        if hasattr(material, 'materials'):
            if len(material.materials) == 0:
                rgba = numpy.ones_like(pcd) * 0.8
                texture = None
                print("Empty MultiMaterial found, falling back to light grey")
            else:
                material = material.materials[0]
        if hasattr(material, 'image'):
            texture = material.image
            if texture is None:
                rgba = numpy.zeros([len(uv), len(material.main_color)]) + material.main_color
        elif hasattr(material, 'baseColorTexture'):
            texture = material.baseColorTexture
            if texture is None:
                rgba = numpy.zeros([len(uv), len(material.main_color)]) + material.main_color
        else:
            texture = None
            rgba = numpy.ones_like(pcd) * 0.8
            print("Unknown material, falling back to light grey")
        if texture is not None:
            rgba = trimesh.visual.uv_to_interpolated_color(uv, texture)
    if rgba.max() > 1:
        if rgba.max() > 255:
            rgba = rgba.astype(f32) / rgba.max()
        else:
            rgba = rgba.astype(f32) / 255.0
    return numpy.concatenate([numpy.array(pcd, f32), numpy.array(rgba, f32)[:, :3]], axis=-1)
def load_pc_data(glb_file: str, swap_yz_axes: bool = False):
    glb_obj = trimesh.load(glb_file)
    pc = trimesh_to_pc(glb_obj)
    assert pc.ndim == 2, "invalid pc shape: ndim = %d != 2" % pc.ndim
    assert pc.shape[1] in [3, 6], "invalid pc shape: should have 3/6 channels, got %d" % pc.shape[1]
    pc = pc.astype(f32)
    if swap_yz_axes:
        pc[:, [1, 2]] = pc[:, [2, 1]]
    pc[:, :3] = pc[:, :3] - numpy.mean(pc[:, :3], axis=0)
    pc[:, :3] = pc[:, :3] / numpy.linalg.norm(pc[:, :3], axis=-1).max()
    if pc.shape[1] == 3:
        pc = numpy.concatenate([pc, numpy.ones_like(pc) * 0.4], axis=-1)
    if pc.shape[0] >= 10000:
        pc = pc[numpy.random.permutation(len(pc))[:10000]]
    elif pc.shape[0] == 0:
        raise ValueError("Got empty point cloud!")
    elif pc.shape[0] < 10000:
        pc = numpy.concatenate([pc, pc[numpy.random.randint(len(pc), size=[10000 - len(pc)])]])
    return pc.astype(f32)
name = 'openshape-pointbert-vitg14-rgb'
pc_encoder = openshape.load_pc_encoder(name=name)
glb_file = "/data/0155b81b90fd4585b388f5c0e34bbdfb.glb"
pc_data = load_pc_data(glb_file=glb_file, swap_yz_axes=False)
pc_encoder.eval()
with torch.no_grad():
    pc_enc = pc_encoder(torch.tensor(pc_data[:, [0, 2, 1, 3, 4, 5]].T[None], device="cuda:0")).cpu()

meta_path = "/data/objaverse_meta.json"
feat_path = "/data/objaverse.pt"
meta, us, feats = load_objaverse_embeddings(meta_path=meta_path, feat_path=feat_path)
dict_uid_to_idx = {k: v for v, k in enumerate(us)}
obj_id = "0155b81b90fd4585b388f5c0e34bbdfb"
idx = dict_uid_to_idx[obj_id]
obja_emb = feats[idx].float()
obja_emb /= obja_emb.norm(dim=-1, keepdim=True)
pc_enc /= pc_enc.norm(dim=-1, keepdim=True)
score = torch.dot(obja_emb, pc_enc[0])
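Here load_objaverse_embeddings is a helper of my own; a minimal sketch of it, assuming the meta file is plain JSON and the feature file uses the "us"/"feats" keys from the extraction snippet above:

import json

def load_objaverse_embeddings(meta_path, feat_path):
    with open(meta_path) as f:
        meta = json.load(f)                           # per-object metadata
    saved = torch.load(feat_path, map_location="cpu")
    return meta, saved["us"], saved["feats"]          # uids and stacked feature tensor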
I think the similarity is fairly high and should not interfere much with downstream applications. I currently cannot tell where the difference comes from; I might check it later.
thanks a lot for your reply
I did some toy experiments; basically, the FPS randomness can cause the embeddings to fluctuate with similarity mostly in 0.97-0.99, and resampling can cause them to fluctuate with similarity between 0.94-0.99. Thus it is natural to have an object embedding with 0.95 similarity to ours.
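The experiment is roughly the following sketch (reusing the names from your snippet above; each loop iteration re-runs the surface sampling):

import torch
import torch.nn.functional as F

embs = []
for _ in range(2):
    pc = load_pc_data(glb_file)  # fresh surface sampling every run
    with torch.no_grad():
        e = pc_encoder(torch.tensor(pc[:, [0, 2, 1, 3, 4, 5]].T[None], device="cuda:0")).cpu()
    embs.append(F.normalize(e, dim=-1))
print(F.cosine_similarity(embs[0], embs[1], dim=-1))  # typically 0.94-0.99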
@eliphatfs
Does the following snippet perform the "sampling" operation, including FPS and resampling? Have I got that right?
pcd, face_idx = trimesh.sample.sample_surface_even(mesh, n_sample_points, rad)
FPS is inside the model (check the PointBERT paper), where 10000 points are split into k (k = 384 in the g14 model) patches randomly distributed across the point cloud.
Sampling is the code you just pasted. It randomly samples 10000 points from the surface of the objects. By 're'-sampling I simply mean running it twice, as the distribution of these points has its randomness.
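To illustrate where the FPS randomness enters, here is a toy farthest point sampler (an illustrative sketch, not the model's actual implementation): the random choice of the first center makes the patch centers differ from run to run.

import numpy

def fps(points, k):
    # the first center is chosen at random -- this is the source of randomness
    centers = [numpy.random.randint(len(points))]
    dist = numpy.full(len(points), numpy.inf)
    for _ in range(k - 1):
        dist = numpy.minimum(dist, numpy.linalg.norm(points - points[centers[-1]], axis=1))
        centers.append(int(dist.argmax()))
    return points[centers]

patch_centers = fps(numpy.random.randn(10000, 3), 384)  # 384 patches as in the g14 model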
I get it now. And thanks again for your reply.
Hi @Colin97 @eliphatfs, thanks for sharing your excellent work.
I wonder how to extract the embeddings of the objects in the Objaverse dataset. Which model was used?