diff --git a/inference_tutorial.ipynb b/inference_tutorial.ipynb
index c923a7f..ab247c4 100644
--- a/inference_tutorial.ipynb
+++ b/inference_tutorial.ipynb
@@ -50,7 +50,7 @@
     "import sys\n",
     "\n",
     "\n",
-    "!{sys.executable} -m pip install einops pytorchvideo timm -q\n",
+    "!{sys.executable} -m pip install einops pytorchvideo timm hydra-core -q\n",
     "\n",
     "# only needed for the tutorial\n",
     "# if the video rendering doesn't work, restart the kernel after installation\n",
@@ -74,8 +74,6 @@
    },
    "outputs": [],
    "source": [
-    "import os \n",
-    "\n",
     "try:\n",
     "    from omnivore.transforms import SpatialCrop, TemporalCrop, DepthNorm\n",
     "except:\n",
@@ -87,10 +85,8 @@
     "\n",
     "import csv\n",
     "import json\n",
-    "from typing import List\n",
     "\n",
     "import torch\n",
-    "import torch.nn.functional as F\n",
     "import torchvision.transforms as T\n",
     "from PIL import Image\n",
     "from pytorchvideo.data.encoded_video import EncodedVideo\n",
@@ -104,7 +100,6 @@
     "\n",
     "%matplotlib inline\n",
     "import matplotlib.pyplot as plt\n",
-    "import matplotlib.image as mpimg\n",
     "from ipywidgets import Video"
    ]
   },
@@ -134,9 +129,9 @@
    "source": [
     "# Device on which to run the model\n",
     "# Set to cuda to load on GPU\n",
-    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\" \n",
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
     "\n",
-    "# Pick a pretrained model \n",
+    "# Pick a pretrained model\n",
     "model_name = \"omnivore_swinB\"\n",
     "model = torch.hub.load(\"facebookresearch/omnivore:main\", model=model_name, force_reload=True)\n",
     "\n",
@@ -185,7 +180,7 @@
     "# Create an id to label name mapping\n",
     "imagenet_id_to_classname = {}\n",
     "for k, v in imagenet_classnames.items():\n",
-    "    imagenet_id_to_classname[k] = v[1] "
+    "    imagenet_id_to_classname[k] = v[1]"
    ]
   },
   {
@@ -348,8 +343,8 @@
     "        key=\"video\",\n",
     "        transform=T.Compose(\n",
     "            [\n",
-    "                UniformTemporalSubsample(num_frames), \n",
-    "                T.Lambda(lambda x: x / 255.0), \n",
+    "                UniformTemporalSubsample(num_frames),\n",
+    "                T.Lambda(lambda x: x / 255.0),\n",
     "                ShortSideScale(size=224),\n",
     "                NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
     "                TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -390,7 +385,7 @@
    "outputs": [],
    "source": [
     "# Load the example video\n",
-    "video_path = \"dance.mp4\" \n",
+    "video_path = \"dance.mp4\"\n",
     "\n",
     "Video.from_file(video_path, width=500)"
    ]
   },
@@ -406,7 +401,7 @@
     "# We crop the video to a smaller resolution and duration to save RAM\n",
     "!ffmpeg -y -ss 0 -i dance.mp4 -filter:v scale=224:-1 -t 1 -v 0 dance_cropped.mp4\n",
     "\n",
-    "video_path = \"dance_cropped.mp4\" "
+    "video_path = \"dance_cropped.mp4\""
    ]
   },
   {
@@ -430,7 +425,7 @@
     "# Move the inputs to the desired device\n",
     "video_inputs = video_data[\"video\"]\n",
     "\n",
-    "# Take the first clip \n",
+    "# Take the first clip\n",
     "# The model expects inputs of shape: B x C x T x H x W\n",
     "video_input = video_inputs[0][None, ...]"
    ]
   },
@@ -452,11 +447,11 @@
    },
    "outputs": [],
    "source": [
-    "# Pass the input clip through the model \n",
+    "# Pass the input clip through the model\n",
     "with torch.no_grad():\n",
     "    prediction = model(video_input.to(device), input_type=\"video\")\n",
     "\n",
-    "    # Get the predicted classes \n",
+    "    # Get the predicted classes\n",
     "    pred_classes = prediction.topk(k=5).indices\n",
     "\n",
     "# Map the predicted classes to the label names\n",
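The hunks above are mostly trailing-whitespace cleanup plus two substantive changes: the pip cell gains the Hydra config library, installed as `hydra-core` on PyPI (the unrelated `hydra` package is a different project), and several unused imports are dropped. For orientation, the cells they touch implement one flow: load a checkpoint from torch.hub, shape a clip as B x C x T x H x W, and read off the top-5 logits. Below is a minimal sketch of that flow with a random tensor standing in for the transformed clip; the hub entry point, the `input_type` argument, and the `topk(k=5)` call are taken from the cells above, everything else is illustrative.

```python
import torch

# Load the pretrained Omnivore checkpoint via torch.hub, as the notebook does
device = "cuda" if torch.cuda.is_available() else "cpu"
model = torch.hub.load("facebookresearch/omnivore:main", model="omnivore_swinB")
model = model.to(device).eval()

# The model expects clips shaped B x C x T x H x W; this random tensor stands
# in for the output of the transform pipeline (32 frames of 224x224 RGB)
video_input = torch.rand(1, 3, 32, 224, 224)

with torch.no_grad():
    prediction = model(video_input.to(device), input_type="video")
    pred_classes = prediction.topk(k=5).indices  # top-5 predicted class ids
```

The dance and EPIC-Kitchens sections later in the diff repeat this same pattern; only the video file and the label mapping differ.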
@@ -531,7 +526,7 @@
     "        T.Resize(224),\n",
     "        T.CenterCrop(224),\n",
     "        T.Normalize(\n",
-    "            mean=[0.485, 0.456, 0.406, 0.0418], \n",
+    "            mean=[0.485, 0.456, 0.406, 0.0418],\n",
     "            std=[0.229, 0.224, 0.225, 0.0295]\n",
     "        ),\n",
     "    ]\n",
@@ -559,7 +554,7 @@
    "source": [
     "# Download the example image and disparity file\n",
     "!wget -O store.png https://upload.wikimedia.org/wikipedia/commons/thumb/f/f4/Interior_of_the_IKEA_B%C4%83neasa_33.jpg/791px-Interior_of_the_IKEA_B%C4%83neasa_33.jpg\n",
-    "!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt "
+    "!wget -O store_disparity.pt https://dl.fbaipublicfiles.com/omnivore/example_data/store_disparity.pt"
    ]
   },
@@ -731,8 +726,8 @@
     "        key=\"video\",\n",
     "        transform=T.Compose(\n",
     "            [\n",
-    "                UniformTemporalSubsample(num_frames), \n",
-    "                T.Lambda(lambda x: x / 255.0), \n",
+    "                UniformTemporalSubsample(num_frames),\n",
+    "                T.Lambda(lambda x: x / 255.0),\n",
     "                ShortSideScale(size=224),\n",
     "                NormalizeVideo(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n",
     "                TemporalCrop(frames_per_clip=32, stride=40),\n",
@@ -773,7 +768,7 @@
    "outputs": [],
    "source": [
     "# Load the example video\n",
-    "video_path = \"epic.mp4\" \n",
+    "video_path = \"epic.mp4\"\n",
     "\n",
     "Video.from_file(video_path, width=500)"
    ]
   },
@@ -789,7 +784,7 @@
     "# We crop the video to a smaller resolution and duration to save RAM\n",
     "!ffmpeg -y -ss 0 -i epic.mp4 -filter:v scale=224:-1 -t 1 -v 0 epic_cropped.mp4\n",
     "\n",
-    "video_path = \"epic_cropped.mp4\" "
+    "video_path = \"epic_cropped.mp4\""
    ]
   },
   {
@@ -812,7 +807,7 @@
     "# Move the inputs to the desired device\n",
     "video_inputs = video_data[\"video\"]\n",
     "\n",
-    "# Take the first clip \n",
+    "# Take the first clip\n",
     "# The model expects inputs of shape: B x C x T x H x W\n",
     "video_input = video_inputs[0][None, ...]"
    ]
   },
@@ -834,11 +829,11 @@
    },
    "outputs": [],
    "source": [
-    "# Pass the input clip through the model \n",
+    "# Pass the input clip through the model\n",
     "with torch.no_grad():\n",
     "    prediction = model(video_input.to(device), input_type=\"video\")\n",
     "\n",
-    "    # Get the predicted classes \n",
+    "    # Get the predicted classes\n",
     "    pred_classes = prediction.topk(k=5).indices\n",
     "\n",
     "# Map the predicted classes to the label names\n",
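One non-obvious detail in the RGB-D hunk above (@@ -531): the four-entry mean/std extends the standard ImageNet RGB statistics with a fourth pair (0.0418, 0.0295) for the disparity channel. Below is a minimal sketch of how such a transform is assembled and applied, assuming `store_disparity.pt` holds an H x W float tensor aligned with the image; the file names mirror the wget cell, and the stacking and final reshape are illustrative rather than the tutorial's exact code.

```python
import torch
import torchvision.transforms as T
from PIL import Image

# ImageNet RGB statistics, extended with a fourth entry for disparity
rgbd_normalize = T.Normalize(
    mean=[0.485, 0.456, 0.406, 0.0418],
    std=[0.229, 0.224, 0.225, 0.0295],
)

rgb = T.ToTensor()(Image.open("store.png").convert("RGB"))  # 3 x H x W in [0, 1]
disparity = torch.load("store_disparity.pt").float()[None]  # 1 x H x W (assumed)

# Stack into a single 4-channel image, then resize/crop/normalize as above
rgbd = torch.cat([rgb, disparity], dim=0)
rgbd = rgbd_normalize(T.CenterCrop(224)(T.Resize(224)(rgbd)))

# The model consumes RGB-D as a single-frame clip: B x C x T x H x W
rgbd_input = rgbd[None, :, None, ...]
```

Inference then mirrors the video cells, with the input type switched from "video" to "rgbd".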
diff --git a/omnimae/README.md b/omnimae/README.md
index 7bafc2f..4747ed6 100644
--- a/omnimae/README.md
+++ b/omnimae/README.md
@@ -54,8 +54,8 @@ If this work is helpful in your research, please consider starring :star: us and
 ```
 
 ## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.
 
 ## License
-OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+OmniMAE is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However, the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).
diff --git a/omnivore/README.md b/omnivore/README.md
index b4c8dde..cd189c0 100644
--- a/omnivore/README.md
+++ b/omnivore/README.md
@@ -116,8 +116,8 @@ If this work is helpful in your research, please consider starring :star: us and
 ```
 
 ## Contributing
-We welcome your pull requests! Please see [CONTRIBUTING](CONTRIBUTING.md) and [CODE_OF_CONDUCT](CODE_OF_CONDUCT.md) for more information.
+We welcome your pull requests! Please see [CONTRIBUTING](../CONTRIBUTING.md) and [CODE_OF_CONDUCT](../CODE_OF_CONDUCT.md) for more information.
 
 ## License
-Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](LICENSE) for additional details. However the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](NOTICE) for additional details).
+Omnivore is released under the CC-BY-NC 4.0 license. See [LICENSE](../LICENSE) for additional details. However, the Swin Transformer implementation is additionally licensed under the Apache 2.0 license (see [NOTICE](../NOTICE) for additional details).