Benchmarking
This package provides a fair amount of infrastructure for benchmarking different hashers to evaluate their performance.
Image Hashing
The below example does the following:
Download a benchmarking dataset (we provide a dataset with images that have compatible licensing for this example)
Load the dataset. If you are using your own dataset, you may wish to call deduplicate on it to ensure no duplicates are included (a sketch of this appears after the dataset is loaded below).
Transform the dataset to generate synthetic images.
Define a new custom hasher that we want to evaluate. It's not very good, but it demonstrates how you can evaluate your own custom hash functions.
Compute all the hashes.
Report metrics for each image category / hasher / transformation combination.
import os
import glob
import zipfile
import urllib.request
import cv2
import imgaug
import tabulate # Optional: Only used for generating tables for the Sphinx documentation
import numpy as np
from perception import benchmarking, hashers
from perception.hashers.image.pdq import PDQHash
urllib.request.urlretrieve(
    "https://thorn-perception.s3.amazonaws.com/thorn-perceptual-benchmark-v0.zip",
    "thorn-perceptual-benchmark-v0.zip"
)
with zipfile.ZipFile('thorn-perceptual-benchmark-v0.zip') as f:
    f.extractall('.')
# Load the dataset
dataset = benchmarking.BenchmarkImageDataset.from_tuples(files=[
    (filepath, filepath.split(os.path.sep)[-2]) for filepath in glob.glob(
        os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg')
    )
])
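# If you are benchmarking a dataset of your own instead, a deduplication pass
# might look like the following sketch. The deduplicate method is part of
# BenchmarkImageDataset; the exact (dataset, duplicate pairs) return shape and
# the threshold value used here are assumptions for illustration.
#
# deduplicated, duplicate_pairs = dataset.deduplicate(
#     hasher=hashers.PHash(hash_size=16), threshold=0.1
# )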
# Define the transforms we want to use for
# evaluating hash quality.
def watermark(image):
    fontScale = 5
    thickness = 5
    text = "TEST"
    fontFace = cv2.FONT_HERSHEY_SIMPLEX
    targetWidth = 0.2*image.shape[1]
    # Measure the text at the initial font scale, then rescale everything
    # so the watermark spans roughly 20% of the image width.
    (textWidth, textHeight), _ = cv2.getTextSize(
        text=text,
        fontFace=fontFace,
        fontScale=fontScale,
        thickness=thickness
    )
    fontScaleCorr = targetWidth / textWidth
    textHeight *= fontScaleCorr
    textWidth *= fontScaleCorr
    fontScale *= fontScaleCorr
    org = (textHeight, image.shape[0] - textHeight)
    org = tuple(map(int, org))
    color = (0, 0, 0, 200)
    # Draw the text onto a transparent RGBA canvas, then alpha-blend it
    # onto the bottom-left corner of the image.
    placeholder = cv2.putText(
        img=np.zeros(image.shape[:2] + (4, ), dtype='uint8'),
        text=text,
        org=org,
        color=color,
        fontFace=fontFace,
        fontScale=fontScale,
        thickness=thickness
    ).astype('float32')
    augmented = (
        image.astype('float32')[..., :3]*(255 - placeholder[..., 3:]) +
        placeholder[..., :3]*placeholder[..., 3:]
    ) / 255
    return augmented.astype('uint8')
def vignette(image):
    height, width = image.shape[:2]
    # Build a 2D Gaussian mask as the outer product of two 1D kernels,
    # normalize it to [0, 1], and use it to darken the image edges.
    kernel_y = cv2.getGaussianKernel(height, height/2)
    kernel_x = cv2.getGaussianKernel(width, width/2)
    mask = (kernel_x.T*kernel_y)[..., np.newaxis]
    mask = mask/mask.max()
    return (image*mask).astype('uint8')
transforms = {
    'watermark': watermark,
    'blur2': imgaug.augmenters.GaussianBlur(sigma=2.0),
    'vignette': vignette,
    'gamma2': imgaug.augmenters.GammaContrast(gamma=2),
    'jpeg95': imgaug.augmenters.JpegCompression(95),
    'pad0.2': imgaug.augmenters.Pad(percent=((0.2, 0.2), (0, 0), (0.2, 0.2), (0, 0)), keep_size=False),
    'crop0.05': imgaug.augmenters.Crop(percent=((0.05, 0.05), (0.05, 0.05), (0.05, 0.05), (0.05, 0.05)), keep_size=False),
    'noise0.2': imgaug.augmenters.AdditiveGaussianNoise(scale=0.2*255),
    'rotate4': imgaug.augmenters.Affine(rotate=4),
    'resize0.5': imgaug.augmenters.Resize(0.5),
    'noop': imgaug.augmenters.Resize({"longer-side": 256, "shorter-side": "keep-aspect-ratio"}),
}
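Each entry in transforms maps a name to a callable that takes an image and returns the transformed image, so plain functions like watermark can be mixed freely with imgaug augmenters. As an optional sanity check (a sketch, not part of the benchmark), you can apply each transform to a single image:

# Optional: apply every transform to one sample image and confirm each
# returns an image array. Calling with image= works for both the plain
# functions above and imgaug augmenters.
sample_path = glob.glob(os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg'))[0]
sample = cv2.cvtColor(cv2.imread(sample_path), cv2.COLOR_BGR2RGB)
for name, transform in transforms.items():
    print(name, transform(image=sample).shape)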
# Compute the transformed versions of the images.
# This takes a while, but you can reload the
# generated dataset without recomputing it (see below).
transformed = dataset.transform(
    transforms=transforms,
    storage_dir='transformed',
    errors="raise"
)
# We don't actually have to do this, but it shows
# how to reload the transformed dataset later.
transformed = benchmarking.BenchmarkImageTransforms.load(
    path_to_zip_or_directory='transformed', verify_md5=False
)
# Create a new hash that we want to evaluate.
# perception will handle most of the plumbing but
# we do have to specify a few things.
class ShrinkHash(hashers.ImageHasher):
    """This is a simple hash to demonstrate how you
    can create your own hasher and compare it to others.
    It just shrinks images to 8x8 pixels and then flattens
    the result.
    """

    # We have to let perception know
    # the shape and type of our hash.
    hash_length = 64
    dtype = 'uint8'

    # We need to specify how distance is
    # computed between hashes.
    distance_metric = 'euclidean'

    def _compute(self, image):
        gray = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)
        resized = cv2.resize(gray, dsize=(8, 8))
        return resized.flatten()
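Before plugging ShrinkHash into the benchmark, you can sanity-check it on a single image. This sketch assumes the standard perception hasher interface, where compute accepts a file path or image array and compute_distance compares two hashes:

# Hash one benchmark image and a blurred copy of it, then check that
# the distance between the two hashes is small.
shrink = ShrinkHash()
example_path = glob.glob(os.path.join('thorn-perceptual-benchmark-v0', '**', '*.jpg'))[0]
original_hash = shrink.compute(example_path)
blurred = imgaug.augmenters.GaussianBlur(sigma=2.0)(
    image=cv2.cvtColor(cv2.imread(example_path), cv2.COLOR_BGR2RGB)
)
print(shrink.compute_distance(original_hash, shrink.compute(blurred)))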
hashers_dict = {
    'ahash': hashers.AverageHash(hash_size=16),
    'dhash': hashers.DHash(hash_size=16),
    'pdq': PDQHash(),
    'phash': hashers.PHash(hash_size=16),
    'marrhildreth': hashers.MarrHildreth(),
    'wavelet': hashers.WaveletHash(hash_size=16),
    'blockmean': hashers.BlockMean(),
    'shrinkhash': ShrinkHash()
}
# Compute the hashes
hashes = transformed.compute_hashes(hashers=hashers_dict)
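Hashing every transformed image takes a while. As in the video example below, the returned BenchmarkHashes object can be saved and reloaded so the hashing step does not have to be repeated (a sketch; the filename is arbitrary):

# Optional: persist the computed hashes and reload them later.
hashes.save('image_hashes.csv')
hashes = benchmarking.BenchmarkHashes.load('image_hashes.csv')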
# Get performance metrics (i.e., recall) for each hash function. For each
# group, compute_threshold_recall finds the largest distance threshold that
# still yields at least the given precision, then reports recall at that
# threshold. Here we require 99.99% precision.
precision_threshold = 99.99
# The metrics are just pandas dataframes. We use tabulate here to obtain the tables
# formatted for the documentation.
metrics = hashes.compute_threshold_recall(precision_threshold=precision_threshold).reset_index()
print(tabulate.tabulate(metrics, showindex=False, headers=metrics.columns, tablefmt='rst'))
metrics_by_transform = hashes.compute_threshold_recall(grouping=['transform_name'], precision_threshold=precision_threshold).reset_index()
print(tabulate.tabulate(metrics_by_transform, showindex=False, headers=metrics_by_transform.columns, tablefmt='rst'))
metrics_simple = hashes.compute_threshold_recall(grouping=[], precision_threshold=precision_threshold).reset_index()
print(tabulate.tabulate(metrics_simple, showindex=False, headers=metrics_simple.columns, tablefmt='rst'))
Metrics grouped by category, transform, and hasher:

category | transform_name | hasher_name | threshold | recall | precision | n_exemplars
---|---|---|---|---|---|---
paintings | blur2 | ahash | 0.0078125 | 51.724 | 100 | 2204
paintings | blur2 | blockmean | 0.0123967 | 85.753 | 100 | 2204
paintings | blur2 | dhash | 0.105469 | 100 | 100 | 2204
paintings | blur2 | marrhildreth | 0.0989583 | 100 | 100 | 2204
paintings | blur2 | pdq | 0.117188 | 100 | 100 | 2204
paintings | blur2 | phash | 0.0390625 | 100 | 100 | 2204
paintings | blur2 | shrinkhash | 60.8112 | 43.33 | 100 | 2204
paintings | blur2 | wavelet | 0.0117188 | 66.379 | 100 | 2204
paintings | crop0.05 | ahash | 0.00390625 | 0.045 | 100 | 2204
paintings | crop0.05 | blockmean | 0.0123967 | 0.227 | 100 | 2204
paintings | crop0.05 | dhash | 0.210938 | 7.577 | 100 | 2204
paintings | crop0.05 | marrhildreth | 0.213542 | 3.584 | 100 | 2204
paintings | crop0.05 | pdq | 0.257812 | 8.439 | 100 | 2204
paintings | crop0.05 | phash | 0.226562 | 6.76 | 100 | 2204
paintings | crop0.05 | shrinkhash | 95.0053 | 2.269 | 100 | 2204
paintings | crop0.05 | wavelet | 0.0078125 | 0 | nan | 2204
paintings | gamma2 | ahash | 0.00390625 | 0.998 | 100 | 2204
paintings | gamma2 | blockmean | 0.0072314 | 1.724 | 100 | 2204
paintings | gamma2 | dhash | 0.167969 | 98.639 | 100 | 2204
paintings | gamma2 | marrhildreth | 0.159722 | 99.41 | 100 | 2204
paintings | gamma2 | pdq | 0.164062 | 100 | 100 | 2204
paintings | gamma2 | phash | 0.164062 | 100 | 100 | 2204
paintings | gamma2 | shrinkhash | 46.5296 | 0 | nan | 2204
paintings | gamma2 | wavelet | 0.0117188 | 18.512 | 100 | 2204
paintings | jpeg95 | ahash | 0.00390625 | 4.22 | 100 | 2204
paintings | jpeg95 | blockmean | 0.0134298 | 28.811 | 100 | 2204
paintings | jpeg95 | dhash | 0.191406 | 94.782 | 100 | 2204
paintings | jpeg95 | marrhildreth | 0.168403 | 82.985 | 100 | 2204
paintings | jpeg95 | pdq | 0.257812 | 100 | 100 | 2204
paintings | jpeg95 | phash | 0.234375 | 100 | 100 | 2204
paintings | jpeg95 | shrinkhash | 66.053 | 55.172 | 100 | 2204
paintings | jpeg95 | wavelet | 0 | 0 | nan | 2204
paintings | noise0.2 | ahash | 0.00390625 | 2.677 | 100 | 2204
paintings | noise0.2 | blockmean | 0.00826446 | 6.987 | 100 | 2204
paintings | noise0.2 | dhash | 0.25 | 93.648 | 100 | 2204
paintings | noise0.2 | marrhildreth | 0.170139 | 73.911 | 100 | 2204
paintings | noise0.2 | pdq | 0.257812 | 99.229 | 100 | 2204
paintings | noise0.2 | phash | 0.257812 | 100 | 100 | 2204
paintings | noise0.2 | shrinkhash | 169.387 | 3.312 | 100 | 2204
paintings | noise0.2 | wavelet | 0.0078125 | 1.407 | 100 | 2204
paintings | noop | ahash | 0 | 100 | 100 | 2204
paintings | noop | blockmean | 0 | 100 | 100 | 2204
paintings | noop | dhash | 0 | 100 | 100 | 2204
paintings | noop | marrhildreth | 0 | 100 | 100 | 2204
paintings | noop | pdq | 0 | 100 | 100 | 2204
paintings | noop | phash | 0 | 100 | 100 | 2204
paintings | noop | shrinkhash | 0 | 100 | 100 | 2204
paintings | noop | wavelet | 0 | 100 | 100 | 2204
paintings | pad0.2 | ahash | 0.0703125 | 0 | nan | 2204
paintings | pad0.2 | blockmean | 0.0795455 | 0 | nan | 2204
paintings | pad0.2 | dhash | 0.210938 | 1.089 | 100 | 2204
paintings | pad0.2 | marrhildreth | 0.177083 | 0 | nan | 2204
paintings | pad0.2 | pdq | 0.289062 | 1.86 | 100 | 2204
paintings | pad0.2 | phash | 0.273438 | 2.541 | 100 | 2204
paintings | pad0.2 | shrinkhash | 146.325 | 0.181 | 100 | 2204
paintings | pad0.2 | wavelet | 0.109375 | 0 | nan | 2204
paintings | resize0.5 | ahash | 0.0078125 | 76.089 | 100 | 2204
paintings | resize0.5 | blockmean | 0.0144628 | 98.185 | 100 | 2204
paintings | resize0.5 | dhash | 0.0976562 | 100 | 100 | 2204
paintings | resize0.5 | marrhildreth | 0.154514 | 99.819 | 100 | 2204
paintings | resize0.5 | pdq | 0.1875 | 100 | 100 | 2204
paintings | resize0.5 | phash | 0.09375 | 100 | 100 | 2204
paintings | resize0.5 | shrinkhash | 56.9034 | 76.27 | 100 | 2204
paintings | resize0.5 | wavelet | 0.0117188 | 84.71 | 100 | 2204
paintings | rotate4 | ahash | 0.0390625 | 2.949 | 100 | 2204
paintings | rotate4 | blockmean | 0.0382231 | 2.949 | 100 | 2204
paintings | rotate4 | dhash | 0.207031 | 36.298 | 100 | 2204
paintings | rotate4 | marrhildreth | 0.227431 | 61.978 | 100 | 2204
paintings | rotate4 | pdq | 0.273438 | 56.08 | 100 | 2204
paintings | rotate4 | phash | 0.257812 | 61.615 | 100 | 2204
paintings | rotate4 | shrinkhash | 69.1737 | 2.813 | 100 | 2204
paintings | rotate4 | wavelet | 0.03125 | 0.136 | 100 | 2204
paintings | vignette | ahash | 0.0429688 | 6.171 | 100 | 2204
paintings | vignette | blockmean | 0.0475207 | 8.122 | 100 | 2204
paintings | vignette | dhash | 0.121094 | 32.305 | 100 | 2204
paintings | vignette | marrhildreth | 0.177083 | 77.904 | 100 | 2204
paintings | vignette | pdq | 0.132812 | 100 | 100 | 2204
paintings | vignette | phash | 0.132812 | 100 | 100 | 2204
paintings | vignette | shrinkhash | 102.186 | 3.267 | 100 | 2204
paintings | vignette | wavelet | 0.046875 | 3.085 | 100 | 2204
paintings | watermark | ahash | 0.00390625 | 20.054 | 100 | 2204
paintings | watermark | blockmean | 0.0123967 | 45.145 | 100 | 2204
paintings | watermark | dhash | 0.0585938 | 100 | 100 | 2204
paintings | watermark | marrhildreth | 0.0625 | 100 | 100 | 2204
paintings | watermark | pdq | 0.273438 | 98.866 | 100 | 2204
paintings | watermark | phash | 0.28125 | 99.456 | 100 | 2204
paintings | watermark | shrinkhash | 104.398 | 75.998 | 100 | 2204
paintings | watermark | wavelet | 0.0117188 | 51.27 | 100 | 2204
photographs | blur2 | ahash | 0.015625 | 76.727 | 100 | 1650
photographs | blur2 | blockmean | 0.0330579 | 98 | 100 | 1650
photographs | blur2 | dhash | 0.0859375 | 98.97 | 100 | 1650
photographs | blur2 | marrhildreth | 0.107639 | 97.576 | 100 | 1650
photographs | blur2 | pdq | 0.304688 | 100 | 100 | 1650
photographs | blur2 | phash | 0.179688 | 100 | 100 | 1650
photographs | blur2 | shrinkhash | 117.627 | 44 | 100 | 1650
photographs | blur2 | wavelet | 0.0195312 | 79.879 | 100 | 1650
photographs | crop0.05 | ahash | 0.0078125 | 0.182 | 100 | 1650
photographs | crop0.05 | blockmean | 0.0258264 | 0.788 | 100 | 1650
photographs | crop0.05 | dhash | 0.0976562 | 1.091 | 100 | 1650
photographs | crop0.05 | marrhildreth | 0.173611 | 3.152 | 100 | 1650
photographs | crop0.05 | pdq | 0.304688 | 30.606 | 100 | 1650
photographs | crop0.05 | phash | 0.320312 | 63.697 | 100 | 1650
photographs | crop0.05 | shrinkhash | 125.94 | 1.152 | 100 | 1650
photographs | crop0.05 | wavelet | 0.015625 | 0.182 | 100 | 1650
photographs | gamma2 | ahash | 0.015625 | 8.182 | 100 | 1650
photographs | gamma2 | blockmean | 0.0268595 | 17.212 | 100 | 1650
photographs | gamma2 | dhash | 0.101562 | 90.303 | 100 | 1650
photographs | gamma2 | marrhildreth | 0.105903 | 90.909 | 100 | 1650
photographs | gamma2 | pdq | 0.210938 | 100 | 100 | 1650
photographs | gamma2 | phash | 0.234375 | 100 | 100 | 1650
photographs | gamma2 | shrinkhash | 119.683 | 0.545 | 100 | 1650
photographs | gamma2 | wavelet | 0.0195312 | 18.424 | 100 | 1650
photographs | jpeg95 | ahash | 0.0117188 | 29.879 | 100 | 1650
photographs | jpeg95 | blockmean | 0.0278926 | 76.788 | 100 | 1650
photographs | jpeg95 | dhash | 0.121094 | 84.182 | 100 | 1650
photographs | jpeg95 | marrhildreth | 0.104167 | 69.576 | 100 | 1650
photographs | jpeg95 | pdq | 0.296875 | 99.879 | 100 | 1650
photographs | jpeg95 | phash | 0.28125 | 99.879 | 100 | 1650
photographs | jpeg95 | shrinkhash | 131.031 | 89.212 | 100 | 1650
photographs | jpeg95 | wavelet | 0.0195312 | 40.242 | 100 | 1650
photographs | noise0.2 | ahash | 0.015625 | 27.636 | 100 | 1650
photographs | noise0.2 | blockmean | 0.036157 | 75.091 | 100 | 1650
photographs | noise0.2 | dhash | 0.121094 | 54.121 | 100 | 1650
photographs | noise0.2 | marrhildreth | 0.0989583 | 46.364 | 100 | 1650
photographs | noise0.2 | pdq | 0.296875 | 99.697 | 100 | 1650
photographs | noise0.2 | phash | 0.304688 | 99.818 | 100 | 1650
photographs | noise0.2 | shrinkhash | 210.661 | 57.576 | 100 | 1650
photographs | noise0.2 | wavelet | 0.0234375 | 27.03 | 100 | 1650
photographs | noop | ahash | 0 | 100 | 100 | 1650
photographs | noop | blockmean | 0 | 100 | 100 | 1650
photographs | noop | dhash | 0 | 100 | 100 | 1650
photographs | noop | marrhildreth | 0 | 100 | 100 | 1650
photographs | noop | pdq | 0 | 100 | 100 | 1650
photographs | noop | phash | 0 | 100 | 100 | 1650
photographs | noop | shrinkhash | 0 | 100 | 100 | 1650
photographs | noop | wavelet | 0 | 100 | 100 | 1650
photographs | pad0.2 | ahash | 0.0429688 | 0.061 | 100 | 1650
photographs | pad0.2 | blockmean | 0.0320248 | 0 | nan | 1650
photographs | pad0.2 | dhash | 0.105469 | 0.545 | 100 | 1650
photographs | pad0.2 | marrhildreth | 0.177083 | 0.121 | 100 | 1650
photographs | pad0.2 | pdq | 0.28125 | 1.455 | 100 | 1650
photographs | pad0.2 | phash | 0.289062 | 3.515 | 100 | 1650
photographs | pad0.2 | shrinkhash | 114.721 | 0.061 | 100 | 1650
photographs | pad0.2 | wavelet | 0.0820312 | 0 | nan | 1650
photographs | resize0.5 | ahash | 0.015625 | 87.697 | 100 | 1650
photographs | resize0.5 | blockmean | 0.0330579 | 99.152 | 100 | 1650
photographs | resize0.5 | dhash | 0.0898438 | 98.485 | 100 | 1650
photographs | resize0.5 | marrhildreth | 0.111111 | 95.394 | 100 | 1650
photographs | resize0.5 | pdq | 0.328125 | 99.818 | 100 | 1650
photographs | resize0.5 | phash | 0.234375 | 100 | 100 | 1650
photographs | resize0.5 | shrinkhash | 132.117 | 80.242 | 100 | 1650
photographs | resize0.5 | wavelet | 0.0195312 | 88.97 | 100 | 1650
photographs | rotate4 | ahash | 0.0273438 | 1.818 | 100 | 1650
photographs | rotate4 | blockmean | 0.0371901 | 3.879 | 100 | 1650
photographs | rotate4 | dhash | 0.09375 | 2.97 | 100 | 1650
photographs | rotate4 | marrhildreth | 0.149306 | 4.606 | 100 | 1650
photographs | rotate4 | pdq | 0.304688 | 73.394 | 100 | 1650
photographs | rotate4 | phash | 0.3125 | 89.818 | 100 | 1650
photographs | rotate4 | shrinkhash | 130.211 | 4.424 | 100 | 1650
photographs | rotate4 | wavelet | 0.0078125 | 0.061 | 100 | 1650
photographs | vignette | ahash | 0.0273438 | 8.242 | 100 | 1650
photographs | vignette | blockmean | 0.0320248 | 10 | 100 | 1650
photographs | vignette | dhash | 0.0703125 | 22 | 100 | 1650
photographs | vignette | marrhildreth | 0.0954861 | 38.727 | 100 | 1650
photographs | vignette | pdq | 0.117188 | 100 | 100 | 1650
photographs | vignette | phash | 0.125 | 100 | 100 | 1650
photographs | vignette | shrinkhash | 138.989 | 11.939 | 100 | 1650
photographs | vignette | wavelet | 0.0195312 | 4.242 | 100 | 1650
photographs | watermark | ahash | 0.015625 | 42.667 | 100 | 1650
photographs | watermark | blockmean | 0.0247934 | 60.788 | 100 | 1650
photographs | watermark | dhash | 0.078125 | 100 | 100 | 1650
photographs | watermark | marrhildreth | 0.112847 | 98.727 | 100 | 1650
photographs | watermark | pdq | 0.3125 | 99.818 | 100 | 1650
photographs | watermark | phash | 0.3125 | 99.758 | 100 | 1650
photographs | watermark | shrinkhash | 142.046 | 79.576 | 100 | 1650
photographs | watermark | wavelet | 0.0195312 | 53.455 | 100 | 1650
Metrics grouped by transform and hasher (both categories combined):

transform_name | hasher_name | threshold | recall | precision | n_exemplars
---|---|---|---|---|---
blur2 | ahash | 0.0078125 | 49.014 | 100 | 3854
blur2 | blockmean | 0.0123967 | 80.773 | 100 | 3854
blur2 | dhash | 0.0859375 | 99.196 | 100 | 3854
blur2 | marrhildreth | 0.107639 | 98.962 | 100 | 3854
blur2 | pdq | 0.234375 | 99.948 | 100 | 3854
blur2 | phash | 0.179688 | 100 | 100 | 3854
blur2 | shrinkhash | 60.8112 | 28.412 | 100 | 3854
blur2 | wavelet | 0.0117188 | 62.247 | 100 | 3854
crop0.05 | ahash | 0.00390625 | 0.052 | 100 | 3854
crop0.05 | blockmean | 0.0123967 | 0.208 | 100 | 3854
crop0.05 | dhash | 0.0976562 | 0.493 | 100 | 3854
crop0.05 | marrhildreth | 0.173611 | 1.635 | 100 | 3854
crop0.05 | pdq | 0.257812 | 9.03 | 100 | 3854
crop0.05 | phash | 0.226562 | 7.058 | 100 | 3854
crop0.05 | shrinkhash | 95.0053 | 1.427 | 100 | 3854
crop0.05 | wavelet | 0.0078125 | 0 | nan | 3854
gamma2 | ahash | 0.00390625 | 0.934 | 100 | 3854
gamma2 | blockmean | 0.0072314 | 1.713 | 100 | 3854
gamma2 | dhash | 0.101562 | 90.036 | 100 | 3854
gamma2 | marrhildreth | 0.105903 | 94.24 | 100 | 3854
gamma2 | pdq | 0.210938 | 100 | 100 | 3854
gamma2 | phash | 0.234375 | 100 | 100 | 3854
gamma2 | shrinkhash | 108.457 | 0.156 | 100 | 3854
gamma2 | wavelet | 0.0117188 | 14.997 | 100 | 3854
jpeg95 | ahash | 0.00390625 | 5.319 | 100 | 3854
jpeg95 | blockmean | 0.0134298 | 32.045 | 100 | 3854
jpeg95 | dhash | 0.121094 | 74.079 | 100 | 3854
jpeg95 | marrhildreth | 0.104167 | 59.263 | 100 | 3854
jpeg95 | pdq | 0.257812 | 99.896 | 100 | 3854
jpeg95 | phash | 0.234375 | 99.896 | 100 | 3854
jpeg95 | shrinkhash | 66.053 | 40.296 | 100 | 3854
jpeg95 | wavelet | 0.00390625 | 3.71 | 100 | 3854
noise0.2 | ahash | 0.00390625 | 2.984 | 100 | 3854
noise0.2 | blockmean | 0.00826446 | 8.563 | 100 | 3854
noise0.2 | dhash | 0.121094 | 40.088 | 100 | 3854
noise0.2 | marrhildreth | 0.0989583 | 33.083 | 100 | 3854
noise0.2 | pdq | 0.257812 | 99.222 | 100 | 3854
noise0.2 | phash | 0.273438 | 99.896 | 100 | 3854
noise0.2 | shrinkhash | 169.387 | 4.385 | 100 | 3854
noise0.2 | wavelet | 0.0078125 | 1.894 | 100 | 3854
noop | ahash | 0 | 100 | 100 | 3854
noop | blockmean | 0 | 100 | 100 | 3854
noop | dhash | 0 | 100 | 100 | 3854
noop | marrhildreth | 0 | 100 | 100 | 3854
noop | pdq | 0 | 100 | 100 | 3854
noop | phash | 0 | 100 | 100 | 3854
noop | shrinkhash | 0 | 100 | 100 | 3854
noop | wavelet | 0 | 100 | 100 | 3854
pad0.2 | ahash | 0.0429688 | 0.026 | 100 | 3854
pad0.2 | blockmean | 0.0320248 | 0 | nan | 3854
pad0.2 | dhash | 0.105469 | 0.234 | 100 | 3854
pad0.2 | marrhildreth | 0.177083 | 0.052 | 100 | 3854
pad0.2 | pdq | 0.28125 | 1.349 | 100 | 3854
pad0.2 | phash | 0.273438 | 2.387 | 100 | 3854
pad0.2 | shrinkhash | 114.721 | 0.052 | 100 | 3854
pad0.2 | wavelet | 0.0820312 | 0 | nan | 3854
resize0.5 | ahash | 0.0078125 | 70.784 | 100 | 3854
resize0.5 | blockmean | 0.0144628 | 95.226 | 100 | 3854
resize0.5 | dhash | 0.0898438 | 99.299 | 100 | 3854
resize0.5 | marrhildreth | 0.112847 | 97.846 | 100 | 3854
resize0.5 | pdq | 0.265625 | 99.844 | 100 | 3854
resize0.5 | phash | 0.234375 | 100 | 100 | 3854
resize0.5 | shrinkhash | 56.9034 | 51.453 | 100 | 3854
resize0.5 | wavelet | 0.0117188 | 80.747 | 100 | 3854
rotate4 | ahash | 0.0273438 | 1.297 | 100 | 3854
rotate4 | blockmean | 0.0371901 | 3.036 | 100 | 3854
rotate4 | dhash | 0.09375 | 1.401 | 100 | 3854
rotate4 | marrhildreth | 0.149306 | 3.762 | 100 | 3854
rotate4 | pdq | 0.273438 | 54.489 | 100 | 3854
rotate4 | phash | 0.257812 | 59.626 | 100 | 3854
rotate4 | shrinkhash | 69.1737 | 1.894 | 100 | 3854
rotate4 | wavelet | 0.0078125 | 0.026 | 100 | 3854
vignette | ahash | 0.0273438 | 4.67 | 100 | 3854
vignette | blockmean | 0.0320248 | 6.098 | 100 | 3854
vignette | dhash | 0.0703125 | 12.195 | 100 | 3854
vignette | marrhildreth | 0.0954861 | 30.54 | 100 | 3854
vignette | pdq | 0.132812 | 100 | 100 | 3854
vignette | phash | 0.132812 | 100 | 100 | 3854
vignette | shrinkhash | 103.005 | 4.541 | 100 | 3854
vignette | wavelet | 0.0195312 | 1.946 | 100 | 3854
watermark | ahash | 0.00390625 | 18.5 | 100 | 3854
watermark | blockmean | 0.0123967 | 41.593 | 100 | 3854
watermark | dhash | 0.078125 | 100 | 100 | 3854
watermark | marrhildreth | 0.112847 | 99.455 | 100 | 3854
watermark | pdq | 0.273438 | 99.014 | 100 | 3854
watermark | phash | 0.28125 | 99.377 | 100 | 3854
watermark | shrinkhash | 104.398 | 71.199 | 100 | 3854
watermark | wavelet | 0.0117188 | 46.912 | 100 | 3854
Overall metrics by hasher:

hasher_name | threshold | recall | precision | n_exemplars
---|---|---|---|---
ahash | 0.00390625 | 17.578 | 100 | 42394
blockmean | 0.00826446 | 27.714 | 100 | 42394
dhash | 0.0859375 | 51.981 | 99.9952 | 42394
marrhildreth | 0.100694 | 55.942 | 99.9957 | 42394
pdq | 0.257812 | 77.181 | 99.9969 | 42394
phash | 0.273438 | 81.967 | 99.9942 | 42394
shrinkhash | 56.9034 | 22.378 | 100 | 42394
wavelet | 0.00390625 | 18.467 | 100 | 42394
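Since the metrics are plain pandas DataFrames, they are easy to reshape for side-by-side comparison. For example, this sketch pivots the per-transform metrics into a transform-by-hasher grid of recall values:

# Recall for each transform (rows) and hasher (columns) at the
# 99.99% precision threshold computed above.
recall_grid = metrics_by_transform.pivot(
    index='transform_name', columns='hasher_name', values='recall'
)
print(recall_grid.round(1))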
Video Hashing
The below example does the following:
Download a benchmarking dataset. Here we use the Charades dataset, which contains over 9,000 videos.
Load the dataset.
Transform the dataset to generate synthetically altered videos. Our hashers are responsible for matching the altered videos with the originals.
Define some hashers we want to evaluate.
Compute all the hashes.
Report metrics for each video category / hasher / transformation combination to see how well our hashers can match the altered videos to the original (“no-op” videos).
import os
import zipfile
import urllib.request
import pandas as pd
import perception.benchmarking
import perception.hashers
if not os.path.isdir('Charades_v1_480'):
    # Download the dataset since it appears we do not have it. Note that
    # these are large files (> 13GB).
    urllib.request.urlretrieve(
        url='http://ai2-website.s3.amazonaws.com/data/Charades_v1_480.zip',
        filename='Charades_v1_480.zip'
    )
    with zipfile.ZipFile('Charades_v1_480.zip') as zfile:
        zfile.extractall('.')
    urllib.request.urlretrieve(
        url='http://ai2-website.s3.amazonaws.com/data/Charades.zip',
        filename='Charades.zip'
    )
    with zipfile.ZipFile('Charades.zip') as zfile:
        zfile.extractall('.')
# These are files that we've identified as having identical subsequences, typically
# when a person is out of frame and the backgrounds are the same.
duplicates = [
    ('0HVVN.mp4', 'UZRQD.mp4'), ('ZIOET.mp4', 'YGXX6.mp4'), ('82XPD.mp4', 'E7QDZ.mp4'),
    ('FQDS1.mp4', 'AIOTI.mp4'), ('PBV4T.mp4', 'XXYWL.mp4'), ('M0P0H.mp4', 'STY6W.mp4'),
    ('3Q92U.mp4', 'GHPO3.mp4'), ('NFIQM.mp4', 'I2DHG.mp4'), ('PIRMO.mp4', '0GFE8.mp4'),
    ('LRPBA.mp4', '9VK0J.mp4'), ('UI0QG.mp4', 'FHXKQ.mp4'), ('Y05U8.mp4', '4RVZB.mp4'),
    ('J6TVB.mp4', '2ZBL5.mp4'), ('A8T8V.mp4', 'IGOQK.mp4'), ('H8QM1.mp4', 'QYMWC.mp4'),
    ('O45BC.mp4', 'ZS7X6.mp4'), ('NOP6W.mp4', 'F7KFE.mp4'), ('4MPPQ.mp4', 'A3M94.mp4'),
    ('L8FFR.mp4', 'M8MP0.mp4'), ('EHYXP.mp4', 'O8PO3.mp4'), ('MGBLJ.mp4', 'RIEG6.mp4'),
    ('53FPM.mp4', 'BLFEV.mp4'), ('UIIF3.mp4', 'TKEKQ.mp4'), ('GVX7E.mp4', '7GPSY.mp4'),
    ('T7HZB.mp4', '6KGZA.mp4'), ('65M4K.mp4', 'UDGP2.mp4'), ('6SS4H.mp4', 'CK6OL.mp4'),
    ('OVHFT.mp4', 'GG1X2.mp4'), ('VEHER.mp4', 'XBPEJ.mp4'), ('WN38A.mp4', '2QI8F.mp4'),
    ('UMXKN.mp4', 'EOKJ0.mp4'), ('OSIKP.mp4', 'WT2C0.mp4'), ('H5V2Y.mp4', 'ZXN6A.mp4'),
    ('XS6PF.mp4', '1WJ6O.mp4'), ('S2XJW.mp4', 'YH0BX.mp4'), ('UO607.mp4', 'Z5JZD.mp4'),
    ('XN64E.mp4', 'CSRZM.mp4'), ('YXI7M.mp4', 'IKQLJ.mp4'), ('1B9C8.mp4', '004QE.mp4'),
    ('V1SQH.mp4', '48WOM.mp4'), ('107YZ.mp4', 'I049A.mp4'), ('3S6WL.mp4', 'SC5YW.mp4'),
    ('OY50Q.mp4', '5T607.mp4'), ('XKH7W.mp4', '028CE.mp4'), ('X8XQE.mp4', 'J0VXY.mp4'),
    ('STB0G.mp4', 'J0VXY.mp4'), ('UNXLF.mp4', 'J0VXY.mp4'), ('56PK0.mp4', 'M1TZR.mp4'),
    ('FVITB.mp4', 'R0M34.mp4'), ('BPZE3.mp4', 'R0M34.mp4'), ('VS7DA.mp4', '1X0M3.mp4'),
    ('I7MEA.mp4', 'YMM1Z.mp4'), ('9N76L.mp4', '0LDP7.mp4'), ('AXS82.mp4', 'W8WRK.mp4'),
    ('8TSU4.mp4', 'MXATD.mp4'), ('80FWF.mp4', '18HFG.mp4'), ('RO3A2.mp4', 'V4HY4.mp4'),
    ('HU409.mp4', 'BDWIX.mp4'), ('3YY88.mp4', 'EHHRS.mp4'), ('65RS3.mp4', 'SLIH4.mp4'),
    ('LR0L8.mp4', 'Y665P.mp4'), ('DVPL2.mp4', 'EI5M3.mp4'), ('0EGNU.mp4', 'CU3JE.mp4'),
    ('94KP4.mp4', '94KP4.mp4'), ('79QDP.mp4', '79QDP.mp4'), ('GKBX9.mp4', 'GKBX9.mp4'),
    ('RX6R8.mp4', 'RX6R8.mp4'), ('PMVT7.mp4', 'PMVT7.mp4'), ('XNXW6.mp4', 'XNXW6.mp4'),
    ('I005F.mp4', 'I005F.mp4'), ('TF95Y.mp4', 'TF95Y.mp4'), ('79QDP.mp4', '79QDP.mp4'),
    ('LQGMM.mp4', 'LQGMM.mp4'), ('QCAUL.mp4', 'QCAUL.mp4'), ('GFVSV.mp4', 'GFVSV.mp4'),
    ('4UYGY.mp4', '4UYGY.mp4'), ('BYDSE.mp4', 'BYDSE.mp4'), ('PV3KQ.mp4', 'PV3KQ.mp4'),
    ('1X0M3.mp4', '1X0M3.mp4'), ('T5FHD.mp4', 'T5FHD.mp4'), ('QRHJJ.mp4', 'QRHJJ.mp4'),
    ('JYBGS.mp4', 'JYBGS.mp4'), ('N2XCF.mp4', 'N2XCF.mp4'), ('OZPA9.mp4', 'OZPA9.mp4'),
    ('297S4.mp4', '297S4.mp4'), ('LHU7D.mp4', 'LHU7D.mp4'), ('TSKZL.mp4', 'TSKZL.mp4'),
    ('BCONW.mp4', 'BCONW.mp4'), ('KBPDM.mp4', 'KBPDM.mp4'), ('7FTBS.mp4', '7FTBS.mp4'),
    ('099Y1.mp4', '099Y1.mp4'), ('S2RIQ.mp4', 'S2RIQ.mp4'), ('22FJU.mp4', '22FJU.mp4'),
    ('99UA6.mp4', '99UA6.mp4'), ('WJ13E.mp4', 'WJ13E.mp4'), ('5OLVC.mp4', '5OLVC.mp4'),
    ('YQ6Z6.mp4', 'YQ6Z6.mp4'), ('T5MLJ.mp4', 'T5MLJ.mp4'), ('0VOQC.mp4', '0VOQC.mp4'),
    ('S2RIQ.mp4', 'S2RIQ.mp4'), ('2VNXF.mp4', '2VNXF.mp4'), ('G87XG.mp4', 'G87XG.mp4'),
    ('RRS54.mp4', 'RRS54.mp4'), ('TXJK7.mp4', 'TXJK7.mp4'), ('G4KE3.mp4', 'G4KE3.mp4'),
    ('3SNSC.mp4', '3SNSC.mp4'), ('U2FA5.mp4', 'U2FA5.mp4'), ('9AFQ7.mp4', '9AFQ7.mp4')
]
blacklist = [fp1 for fp1, fp2 in duplicates]
df = pd.concat([pd.read_csv('Charades/Charades_v1_test.csv'), pd.read_csv('Charades/Charades_v1_train.csv')])
df = df[~(df['id'] + '.mp4').isin(blacklist)]
df['filepath'] = df['id'].apply(lambda video_id: os.path.join('Charades_v1_480', video_id + '.mp4'))
assert df['filepath'].apply(os.path.isfile).all(), 'Some video files are missing.'
dataset = perception.benchmarking.BenchmarkVideoDataset.from_tuples(
    files=df[['filepath', 'scene']].itertuples(index=False)
)
if not os.path.isdir('benchmarking_videos'):
    # We haven't computed the transforms yet, so we do that
    # now. Below, we create the following files for each of
    # the videos in our dataset. Note that the only required
    # transform is `noop` (see documentation for
    # perception.benchmarking.BenchmarkVideoDataset.transform).
    #
    # noop: This is the base video we'll actually use in benchmarking, rather
    #       than using the raw video. It is the same as the raw video but downsampled
    #       to a size that is reasonable for hashing (240p). This is because all
    #       of our hashers downsample to a size smaller than this anyway, so there
    #       is no benefit to a higher resolution. Also, we limit the length to the
    #       first five minutes of the video, which speeds everything up significantly.
    # shrink: Shrink the noop video down to 70% of its original size.
    # clip0.2: Clip off the first 20% and last 20% of the noop video.
    # slideshow: Create a slideshow version of the video that grabs frames periodically
    #            from the original.
    # black_frames: Add black frames to the start and end of the video.
    # gif: Create a GIF from the video (similar to slideshow but with re-encoding).
    # black_padding: Add black bars to the top and bottom of the video.
    pad_width = 240
    pad_height = 320
    transforms = {
        'noop': perception.benchmarking.video_transforms.get_simple_transform(
            width='ceil(min(240/max(iw, ih), 1)*iw/2)*2',
            height='ceil(min(240/max(iw, ih), 1)*ih/2)*2',
            codec='h264',
            output_ext='.m4v',
            sar='1/1',
            clip_s=(None, 60*5)
        ),
        'shrink': perception.benchmarking.video_transforms.get_simple_transform(
            width='ceil(0.7*iw/2)*2',
            height='ceil(0.7*ih/2)*2'
        ),
        'clip0.2': perception.benchmarking.video_transforms.get_simple_transform(clip_pct=(0.2, 0.8)),
        'slideshow': perception.benchmarking.video_transforms.get_slideshow_transform(
            frame_input_rate=1/2.5, frame_output_rate=0.5, max_frames=10, offset=1.3),
        'black_frames': perception.benchmarking.video_transforms.get_black_frame_padding_transform(0.5, 0.05),
        'gif': perception.benchmarking.video_transforms.get_simple_transform(
            output_ext='.gif', codec='gif', clip_s=(1.2, 10.2), fps=1/2.5
        ),
        'black_padding': perception.benchmarking.video_transforms.get_simple_transform(
            width=f'(iw*sar)*min({pad_width}/(iw*sar),{pad_height}/ih)',
            height=f'ih*min({pad_width}/(iw*sar),{pad_height}/ih)',
            pad=f'{pad_width}:{pad_height}:({pad_width}-iw*min({pad_width}/iw,{pad_height}/ih))/2:({pad_height}-ih*min({pad_width}/iw,{pad_height}/ih))/2'
        )
    }
    # Save the transforms for later.
    transformed = dataset.transform(transforms=transforms, storage_dir='benchmarking_videos')
transformed = perception.benchmarking.BenchmarkVideoTransforms.load('benchmarking_videos', verify_md5=False)
phashu8 = perception.hashers.PHashU8(exclude_first_term=False, freq_shift=1, hash_size=12)
hashers = {
    'phashu8_framewise': perception.hashers.FramewiseHasher(
        frames_per_second=1, frame_hasher=phashu8,
        interframe_threshold=50, quality_threshold=90),
    'phashu8_tmkl1': perception.hashers.SimpleSceneDetection(
        base_hasher=perception.hashers.TMKL1(
            frames_per_second=5, frame_hasher=phashu8,
            distance_metric='euclidean', dtype='uint8',
            norm=None, quality_threshold=90)
    )
}
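Before kicking off the full benchmark, it can be worth hashing a single video to confirm the hashers are configured correctly. This sketch assumes the standard video hasher compute interface (a file path in, one or more hashes out), with a framewise hasher assumed to return one hash per retained frame:

# Optional sanity check on one video before hashing all of them.
example_filepath = df['filepath'].iloc[0]
frame_hashes = hashers['phashu8_framewise'].compute(example_filepath, errors='raise')
print(len(frame_hashes), 'framewise hashes computed')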
if not os.path.isfile('hashes.csv'):
    # We haven't computed the hashes, so we do that now.
    hashes = transformed.compute_hashes(hashers=hashers, max_workers=5)
    # Save the hashes for later. It took a long time after all!
    hashes.save('hashes.csv')
hashes = perception.benchmarking.BenchmarkHashes.load('hashes.csv')
hashes.compute_threshold_recall(precision_threshold=99.9, grouping=['transform_name'])
transform_name | hasher_name | threshold | recall | precision | n_exemplars
---|---|---|---|---|---
black_frames | phashu8_framewise | 51.0979 | 88.12 | 99.9069 | 278644
black_frames | phashu8_tmkl1 | 55.7584 | 99.918 | 99.9079 | 403768
black_padding | phashu8_framewise | 74.6391 | 7.662 | 100 | 277399
black_padding | phashu8_tmkl1 | 53.8702 | 99.898 | 99.9079 | 406899
clip0.2 | phashu8_framewise | 54.8635 | 90.741 | 99.9098 | 224264
clip0.2 | phashu8_tmkl1 | 59.0424 | 99.724 | 99.9077 | 324251
gif | phashu8_framewise | 55.4437 | 68.21 | 99.9088 | 82232
gif | phashu8_tmkl1 | 55.4887 | 81.029 | 99.9103 | 39757
noop | phashu8_framewise | 0 | 100 | 100 | 282658
noop | phashu8_tmkl1 | 0 | 100 | 100 | 408871
shrink | phashu8_framewise | 24.7184 | 100 | 100 | 281731
shrink | phashu8_tmkl1 | 49.8999 | 99.836 | 99.9078 | 400650
slideshow | phashu8_framewise | 56.9825 | 99.713 | 99.9076 | 172829
slideshow | phashu8_tmkl1 | 56.8683 | 95.934 | 99.9035 | 90684