# https://codeandlife.com/2023/01/26/mastering-the-huggingface-clip-model-how-to-extract-embeddings-and-calculate-similarity-for-text-and-images/
import torch
from PIL import Image
from transformers import AutoProcessor, AutoTokenizer, CLIPModel

# Single checkpoint id shared by the model, tokenizer and image processor so
# the three can never drift apart.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)

# Get the text features
tokenizer = AutoTokenizer.from_pretrained(CLIP_CHECKPOINT)


open_subtask = [
    "locate the computer",
    "locate the book",
    "locate the remote control",
    "transport the computer to the sofa",
    "transport the book to the sofa",
    "transport the remote control to the sofa",
]
# padding=True lets prompts of different lengths share one batch tensor.
inputs = tokenizer(open_subtask, padding=True, return_tensors="pt")
# Pure inference: disable autograd so no graph is built for the embeddings.
with torch.no_grad():
    text_features = model.get_text_features(**inputs)

print(text_features.shape)  # output shape of text features

# Get the image features
processor = AutoProcessor.from_pretrained(CLIP_CHECKPOINT)

# images are from 4 different rotations of the same scene
# [0, 90, 180, 270] degrees
images = []
for i in range(4):
    with Image.open(f"test_exploration_images/frame_{i}.png") as frame:
        # Image.open is lazy; read the pixel data now so the image stays
        # usable after the file handle is closed on leaving the `with`.
        frame.load()
        images.append(frame)

inputs = processor(images=images, return_tensors="pt")

with torch.no_grad():
    image_features = model.get_image_features(**inputs)
print(image_features.shape)

# L2-normalise both sets of embeddings so the dot product below is the cosine
# similarity.
image_features_norm = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
text_features_norm = text_features / text_features.norm(p=2, dim=-1, keepdim=True)

# One row per subtask prompt, one column per image.
sim_scores = torch.matmul(text_features_norm, image_features_norm.t())
# add sim_scores across all subtasks to get the final similarity score for each direction
sim_scores_final = sim_scores.sum(0)
