2
0
mirror of https://github.com/Shawn-Shan/fawkes.git synced 2026-06-12 21:50:46 +05:30

add fawkes-lite and fawkes-dev

This commit is contained in:
Shawn-Shan
2020-06-28 19:13:14 -05:00
parent 5b15437b69
commit 7fc3a82437
11 changed files with 1092 additions and 86 deletions
+57
View File
@@ -0,0 +1,57 @@
# Fawkes
Code implementation of the paper "[Fawkes: Protecting Personal Privacy against Unauthorized Deep Learning Models](https://arxiv.org/pdf/2002.08327.pdf)", at *USENIX Security 2020*.
### BEFORE YOU RUN OUR CODE
We appreciate your interest in our work and for trying out our code. We've noticed several cases where incorrect configuration leads to poor performances of protection. If you also observe low detection performance far away from what we presented in the paper, please feel free to open an issue in this repo or contact any of the authors directly. We are more than happy to help you debug your experiment and find out the correct configuration.
### ABOUT
This repository contains code implementation of the paper "[Fawkes: Protecting Personal Privacy against Unauthorized Deep Learning Models](https://arxiv.org/pdf/2002.08327.pdf)", at *USENIX Security 2020*.
### DEPENDENCIES
Our code is implemented and tested on Keras with TensorFlow backend. Following packages are used by our code.
- `keras==2.3.1`
- `numpy==1.18.4`
- `tensorflow-gpu==1.13.1`
Our code is tested on `Python 3.6.8`
### HOWTO
#### Download and Config Datasets
The first step is to download several datasets for protection and target selection.
1. Download the following dataset to your local machine. After downloading the datasets, restructure it the same way as the FaceScrub dataset downloaded.
- FaceScrub -- used for protection evaluation (link)
- VGGFace1 -- used for target select (link)
- VGGFace2 -- used for target select (link)
- WebFace -- used for target select (link)
2. Config datasets
open `fawkes/config.py` and update the `DATASETS` dictionary with the path to each dataset. Then run `python fawkes/config.py`. Every time the datasets are updated or moved, remember to rerun the command with the updated path.
3. Calculate embeddings using feature extractor.
Run `python3 fawkes/prepare_feature_extractor.py --candidate-datasets scrub vggface1 vggface2 webface`. This will calculate and cache the embeddings using the default feature extractor we provide. To use a customized feature extractor, please look at the Advance section at the end.
#### Generate Cloak for Images
To generate cloak, run
`python3 fawkes/protection.py --gpu 0 --dataset scrub --feature-extractor webface_dense_robust_extract`
For more information about the detailed parameters, please read `fawkes/protection.py`.
The code will output a directory in `results/` with `cloak_data.p` inside. You can check the cloaked images or inspect the changes in `this notebook`.
#### Evaluate Cloak Effectiveness
To evaluate the cloak, run `python3 fawkes/eval_cloak.py --gpu 0 --cloak_data PATH-TO-RESULT-DIRECTORY --transfer_model vggface2_inception_extract`.
The code will print out the tracker model accuracy on uncloaked/original test images of the protected user, which should be close to 0.
### Citation
```
@inproceedings{shan2020fawkes,
title={Fawkes: Protecting Personal Privacy against Unauthorized Deep Learning Models},
author={Shan, Shawn and Wenger, Emily and Zhang, Jiayun and Li, Huiying and Zheng, Haitao and Zhao, Ben Y},
booktitle="Proc. of USENIX Security",
year={2020}
}
```
View File
+40
View File
@@ -0,0 +1,40 @@
import glob
import json
import os
DATASETS = {
"scrub": "../data/scrub/",
"vggface1": "/mnt/data/sixiongshan/data/vggface/",
# "vggface2": "/mnt/data/sixiongshan/data/vggface2/",
"webface": "/mnt/data/sixiongshan/data/webface/",
# "youtubeface": "/mnt/data/sixiongshan/data/youtubeface/keras_flow_data/",
}
def main():
config = {}
for dataset in DATASETS.keys():
path = DATASETS[dataset]
if not os.path.exists(path):
print("Dataset path for {} does not exist, skipped".format(dataset))
continue
train_dir = os.path.join(path, "train")
test_dir = os.path.join(path, "test")
if not os.path.exists(train_dir):
print("Training dataset path for {} does not exist, skipped".format(dataset))
continue
num_classes = len(os.listdir(train_dir))
num_images = len(glob.glob(os.path.join(train_dir, "*/*")))
if num_images == 0 or num_classes == 0 or num_images == num_classes:
raise Exception("Dataset {} is not setup as detailed in README.".format(dataset))
config[dataset] = {"train_dir": train_dir, "test_dir": test_dir, "num_classes": num_classes,
"num_images": num_images}
print("Successfully config {}".format(dataset))
j = json.dumps(config)
with open("config.json", "wb") as f:
f.write(j.encode())
if __name__ == '__main__':
main()
+431
View File
@@ -0,0 +1,431 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2020-05-17
# @Author : Shawn Shan (shansixiong@cs.uchicago.edu)
# @Link : https://www.shawnshan.com/
import datetime
import time
from decimal import Decimal
import numpy as np
import tensorflow as tf
from utils import preprocess, reverse_preprocess
class FawkesMaskGeneration:
# if the attack is trying to mimic a target image or a neuron vector
MIMIC_IMG = True
# number of iterations to perform gradient descent
MAX_ITERATIONS = 10000
# larger values converge faster to less accurate results
LEARNING_RATE = 1e-2
# the initial constant c to pick as a first guess
INITIAL_CONST = 1
# pixel intensity range
INTENSITY_RANGE = 'imagenet'
# threshold for distance
L_THRESHOLD = 0.03
# whether keep the final result or the best result
KEEP_FINAL = False
# max_val of image
MAX_VAL = 255
# The following variables are used by DSSIM, should keep as default
# filter size in SSIM
FILTER_SIZE = 11
# filter sigma in SSIM
FILTER_SIGMA = 1.5
# weights used in MS-SSIM
SCALE_WEIGHTS = None
MAXIMIZE = False
IMAGE_SHAPE = (224, 224, 3)
RATIO = 1.0
LIMIT_DIST = False
def __init__(self, sess, bottleneck_model_ls, mimic_img=MIMIC_IMG,
batch_size=1, learning_rate=LEARNING_RATE,
max_iterations=MAX_ITERATIONS, initial_const=INITIAL_CONST,
intensity_range=INTENSITY_RANGE, l_threshold=L_THRESHOLD,
max_val=MAX_VAL, keep_final=KEEP_FINAL, maximize=MAXIMIZE, image_shape=IMAGE_SHAPE,
verbose=0, ratio=RATIO, limit_dist=LIMIT_DIST):
assert intensity_range in {'raw', 'imagenet', 'inception', 'mnist'}
# constant used for tanh transformation to avoid corner cases
self.tanh_constant = 2 - 1e-6
self.sess = sess
self.MIMIC_IMG = mimic_img
self.LEARNING_RATE = learning_rate
self.MAX_ITERATIONS = max_iterations
self.initial_const = initial_const
self.batch_size = batch_size
self.intensity_range = intensity_range
self.l_threshold = l_threshold
self.max_val = max_val
self.keep_final = keep_final
self.verbose = verbose
self.maximize = maximize
self.learning_rate = learning_rate
self.ratio = ratio
self.limit_dist = limit_dist
self.single_shape = list(image_shape)
self.input_shape = tuple([self.batch_size] + self.single_shape)
self.bottleneck_shape = tuple([self.batch_size] + self.single_shape)
# the variable we're going to optimize over
self.modifier = tf.Variable(np.zeros(self.input_shape, dtype=np.float32))
# target image in tanh space
if self.MIMIC_IMG:
self.timg_tanh = tf.Variable(np.zeros(self.input_shape), dtype=np.float32)
else:
self.bottleneck_t_raw = tf.Variable(np.zeros(self.bottleneck_shape), dtype=np.float32)
# source image in tanh space
self.simg_tanh = tf.Variable(np.zeros(self.input_shape), dtype=np.float32)
self.const = tf.Variable(np.ones(batch_size), dtype=np.float32)
self.mask = tf.Variable(np.ones((batch_size), dtype=np.bool))
self.weights = tf.Variable(np.ones(self.bottleneck_shape,
dtype=np.float32))
# and here's what we use to assign them
self.assign_modifier = tf.placeholder(tf.float32, self.input_shape)
if self.MIMIC_IMG:
self.assign_timg_tanh = tf.placeholder(
tf.float32, self.input_shape)
else:
self.assign_bottleneck_t_raw = tf.placeholder(
tf.float32, self.bottleneck_shape)
self.assign_simg_tanh = tf.placeholder(tf.float32, self.input_shape)
self.assign_const = tf.placeholder(tf.float32, (batch_size))
self.assign_mask = tf.placeholder(tf.bool, (batch_size))
self.assign_weights = tf.placeholder(tf.float32, self.bottleneck_shape)
# the resulting image, tanh'd to keep bounded from -0.5 to 0.5
# adversarial image in raw space
self.aimg_raw = (tf.tanh(self.modifier + self.simg_tanh) /
self.tanh_constant +
0.5) * 255.0
# source image in raw space
self.simg_raw = (tf.tanh(self.simg_tanh) /
self.tanh_constant +
0.5) * 255.0
if self.MIMIC_IMG:
# target image in raw space
self.timg_raw = (tf.tanh(self.timg_tanh) /
self.tanh_constant +
0.5) * 255.0
# convert source and adversarial image into input space
if self.intensity_range == 'imagenet':
mean = tf.constant(np.repeat([[[[103.939, 116.779, 123.68]]]], self.batch_size, axis=0), dtype=tf.float32,
name='img_mean')
self.aimg_input = (self.aimg_raw[..., ::-1] - mean)
self.simg_input = (self.simg_raw[..., ::-1] - mean)
if self.MIMIC_IMG:
self.timg_input = (self.timg_raw[..., ::-1] - mean)
elif self.intensity_range == 'raw':
self.aimg_input = self.aimg_raw
self.simg_input = self.simg_raw
if self.MIMIC_IMG:
self.timg_input = self.timg_raw
def batch_gen_DSSIM(aimg_raw_split, simg_raw_split):
msssim_split = tf.image.ssim(aimg_raw_split, simg_raw_split, max_val=255.0)
dist = (1.0 - tf.stack(msssim_split)) / 2.0
return dist
# raw value of DSSIM distance
self.dist_raw = batch_gen_DSSIM(self.aimg_raw, self.simg_raw)
# distance value after applying threshold
self.dist = tf.maximum(self.dist_raw - self.l_threshold, 0.0)
self.dist_raw_sum = tf.reduce_sum(
tf.where(self.mask,
self.dist_raw,
tf.zeros_like(self.dist_raw)))
self.dist_sum = tf.reduce_sum(tf.where(self.mask, self.dist, tf.zeros_like(self.dist)))
def resize_tensor(input_tensor, model_input_shape):
if input_tensor.shape[1:] == model_input_shape or model_input_shape[1] is None:
return input_tensor
resized_tensor = tf.image.resize(input_tensor, model_input_shape[:2])
return resized_tensor
def calculate_direction(bottleneck_model, cur_timg_input, cur_simg_input):
target_features = bottleneck_model(cur_timg_input)
return target_features
# target_center = tf.reduce_mean(target_features, axis=0)
# original = bottleneck_model(cur_simg_input)
# original_center = tf.reduce_mean(original, axis=0)
# direction = target_center - original_center
# final_target = original + self.ratio * direction
# return final_target
self.bottlesim = 0.0
self.bottlesim_sum = 0.0
self.bottlesim_push = 0.0
for bottleneck_model in bottleneck_model_ls:
model_input_shape = bottleneck_model.input_shape[1:]
cur_aimg_input = resize_tensor(self.aimg_input, model_input_shape)
self.bottleneck_a = bottleneck_model(cur_aimg_input)
if self.MIMIC_IMG:
# cur_timg_input = resize_tensor(self.timg_input, model_input_shape)
# cur_simg_input = resize_tensor(self.simg_input, model_input_shape)
cur_timg_input = self.timg_input
cur_simg_input = self.simg_input
self.bottleneck_t = calculate_direction(bottleneck_model, cur_timg_input, cur_simg_input)
# self.bottleneck_t = bottleneck_model(cur_timg_input)
else:
self.bottleneck_t = self.bottleneck_t_raw
bottleneck_diff = self.bottleneck_t - self.bottleneck_a
scale_factor = tf.sqrt(tf.reduce_sum(tf.square(self.bottleneck_t), axis=1))
cur_bottlesim = tf.sqrt(tf.reduce_sum(tf.square(bottleneck_diff), axis=1))
cur_bottlesim = cur_bottlesim / scale_factor
cur_bottlesim_sum = tf.reduce_sum(cur_bottlesim)
self.bottlesim += cur_bottlesim
# self.bottlesim_push += cur_bottlesim_push_sum
self.bottlesim_sum += cur_bottlesim_sum
# sum up the losses
if self.maximize:
self.loss = self.const * tf.square(self.dist) - self.bottlesim
else:
self.loss = self.const * tf.square(self.dist) + self.bottlesim
self.loss_sum = tf.reduce_sum(tf.where(self.mask, self.loss, tf.zeros_like(self.loss)))
# Setup the Adadelta optimizer and keep track of variables
# we're creating
start_vars = set(x.name for x in tf.global_variables())
self.learning_rate_holder = tf.placeholder(tf.float32, shape=[])
# optimizer = tf.train.AdadeltaOptimizer(self.learning_rate_holder)
optimizer = tf.train.AdamOptimizer(self.learning_rate_holder)
self.train = optimizer.minimize(self.loss_sum,
var_list=[self.modifier])
end_vars = tf.global_variables()
new_vars = [x for x in end_vars if x.name not in start_vars]
# these are the variables to initialize when we run
self.setup = []
self.setup.append(self.modifier.assign(self.assign_modifier))
if self.MIMIC_IMG:
self.setup.append(self.timg_tanh.assign(self.assign_timg_tanh))
else:
self.setup.append(self.bottleneck_t_raw.assign(
self.assign_bottleneck_t_raw))
self.setup.append(self.simg_tanh.assign(self.assign_simg_tanh))
self.setup.append(self.const.assign(self.assign_const))
self.setup.append(self.mask.assign(self.assign_mask))
self.setup.append(self.weights.assign(self.assign_weights))
self.init = tf.variables_initializer(var_list=[self.modifier] + new_vars)
print('Attacker loaded')
def preprocess_arctanh(self, imgs):
imgs = reverse_preprocess(imgs, self.intensity_range)
imgs /= 255.0
imgs -= 0.5
imgs *= self.tanh_constant
tanh_imgs = np.arctanh(imgs)
return tanh_imgs
def clipping(self, imgs):
imgs = reverse_preprocess(imgs, self.intensity_range)
imgs = np.clip(imgs, 0, self.max_val)
imgs = np.rint(imgs)
imgs = preprocess(imgs, self.intensity_range)
return imgs
def attack(self, source_imgs, target_imgs, weights=None):
if weights is None:
weights = np.ones([source_imgs.shape[0]] +
list(self.bottleneck_shape[1:]))
assert weights.shape[1:] == self.bottleneck_shape[1:]
assert source_imgs.shape[1:] == self.input_shape[1:]
assert source_imgs.shape[0] == weights.shape[0]
if self.MIMIC_IMG:
assert target_imgs.shape[1:] == self.input_shape[1:]
assert source_imgs.shape[0] == target_imgs.shape[0]
else:
assert target_imgs.shape[1:] == self.bottleneck_shape[1:]
assert source_imgs.shape[0] == target_imgs.shape[0]
start_time = time.time()
adv_imgs = []
print('%d batches in total'
% int(np.ceil(len(source_imgs) / self.batch_size)))
for idx in range(0, len(source_imgs), self.batch_size):
print('processing batch %d at %s' % (idx, datetime.datetime.now()))
adv_img = self.attack_batch(source_imgs[idx:idx + self.batch_size],
target_imgs[idx:idx + self.batch_size],
weights[idx:idx + self.batch_size])
adv_imgs.extend(adv_img)
elapsed_time = time.time() - start_time
print('attack cost %f s' % (elapsed_time))
return np.array(adv_imgs)
def attack_batch(self, source_imgs, target_imgs, weights):
"""
Run the attack on a batch of images and labels.
"""
LR = self.learning_rate
nb_imgs = source_imgs.shape[0]
mask = [True] * nb_imgs + [False] * (self.batch_size - nb_imgs)
mask = np.array(mask, dtype=np.bool)
source_imgs = np.array(source_imgs)
target_imgs = np.array(target_imgs)
# convert to tanh-space
simg_tanh = self.preprocess_arctanh(source_imgs)
if self.MIMIC_IMG:
timg_tanh = self.preprocess_arctanh(target_imgs)
else:
timg_tanh = target_imgs
CONST = np.ones(self.batch_size) * self.initial_const
self.sess.run(self.init)
simg_tanh_batch = np.zeros(self.input_shape)
if self.MIMIC_IMG:
timg_tanh_batch = np.zeros(self.input_shape)
else:
timg_tanh_batch = np.zeros(self.bottleneck_shape)
weights_batch = np.zeros(self.bottleneck_shape)
simg_tanh_batch[:nb_imgs] = simg_tanh[:nb_imgs]
timg_tanh_batch[:nb_imgs] = timg_tanh[:nb_imgs]
weights_batch[:nb_imgs] = weights[:nb_imgs]
modifier_batch = np.ones(self.input_shape) * 1e-6
# set the variables so that we don't have to send them over again
if self.MIMIC_IMG:
self.sess.run(self.setup,
{self.assign_timg_tanh: timg_tanh_batch,
self.assign_simg_tanh: simg_tanh_batch,
self.assign_const: CONST,
self.assign_mask: mask,
self.assign_weights: weights_batch,
self.assign_modifier: modifier_batch})
else:
# if directly mimicking a vector, use assign_bottleneck_t_raw
# in setup
self.sess.run(self.setup,
{self.assign_bottleneck_t_raw: timg_tanh_batch,
self.assign_simg_tanh: simg_tanh_batch,
self.assign_const: CONST,
self.assign_mask: mask,
self.assign_weights: weights_batch,
self.assign_modifier: modifier_batch})
best_bottlesim = [0] * nb_imgs if self.maximize else [np.inf] * nb_imgs
best_adv = np.zeros_like(source_imgs)
if self.verbose == 1:
loss_sum = float(self.sess.run(self.loss_sum))
dist_sum = float(self.sess.run(self.dist_sum))
thresh_over = (dist_sum / self.batch_size / self.l_threshold * 100)
dist_raw_sum = float(self.sess.run(self.dist_raw_sum))
bottlesim_sum = self.sess.run(self.bottlesim_sum)
print('START: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f'
% (Decimal(loss_sum),
dist_sum,
thresh_over,
dist_raw_sum,
bottlesim_sum / nb_imgs))
try:
total_distance = [0] * nb_imgs
if self.limit_dist:
dist_raw_list, bottlesim_list, aimg_input_list = self.sess.run(
[self.dist_raw,
self.bottlesim,
self.aimg_input])
for e, (dist_raw, bottlesim, aimg_input) in enumerate(
zip(dist_raw_list, bottlesim_list, aimg_input_list)):
if e >= nb_imgs:
break
total_distance[e] = bottlesim
for iteration in range(self.MAX_ITERATIONS):
self.sess.run([self.train], feed_dict={self.learning_rate_holder: LR})
dist_raw_list, bottlesim_list, aimg_input_list = self.sess.run(
[self.dist_raw,
self.bottlesim,
self.aimg_input])
for e, (dist_raw, bottlesim, aimg_input) in enumerate(
zip(dist_raw_list, bottlesim_list, aimg_input_list)):
if e >= nb_imgs:
break
if (bottlesim < best_bottlesim[e] and bottlesim > total_distance[e] * 0.1 and (
not self.maximize)) or (
bottlesim > best_bottlesim[e] and self.maximize):
best_bottlesim[e] = bottlesim
best_adv[e] = aimg_input
if iteration != 0 and iteration % (self.MAX_ITERATIONS // 3) == 0:
LR = LR / 2
print("Learning Rate: ", LR)
if iteration % (self.MAX_ITERATIONS // 10) == 0:
if self.verbose == 1:
loss_sum = float(self.sess.run(self.loss_sum))
dist_sum = float(self.sess.run(self.dist_sum))
thresh_over = (dist_sum /
self.batch_size /
self.l_threshold *
100)
dist_raw_sum = float(self.sess.run(self.dist_raw_sum))
bottlesim_sum = self.sess.run(self.bottlesim_sum)
print('ITER %4d: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f'
% (iteration,
Decimal(loss_sum),
dist_sum,
thresh_over,
dist_raw_sum,
bottlesim_sum / nb_imgs))
except KeyboardInterrupt:
pass
if self.verbose == 1:
loss_sum = float(self.sess.run(self.loss_sum))
dist_sum = float(self.sess.run(self.dist_sum))
thresh_over = (dist_sum / self.batch_size / self.l_threshold * 100)
dist_raw_sum = float(self.sess.run(self.dist_raw_sum))
bottlesim_sum = float(self.sess.run(self.bottlesim_sum))
print('END: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f'
% (Decimal(loss_sum),
dist_sum,
thresh_over,
dist_raw_sum,
bottlesim_sum / nb_imgs))
best_adv = self.clipping(best_adv[:nb_imgs])
return best_adv
+157
View File
@@ -0,0 +1,157 @@
import sys
sys.path.append("/home/shansixioing/tools/")
sys.path.append("/home/shansixioing/cloak/")
import argparse
from tensorflow import set_random_seed
from utils import init_gpu, load_extractor, load_victim_model, dump_dictionary_as_json
import os
import numpy as np
import random
import pickle
import re
from keras.preprocessing import image
from keras.utils import to_categorical
from keras.applications.vgg16 import preprocess_input
# import locale
#
# loc = locale.getlocale()
# locale.setlocale(locale.LC_ALL, loc)
def select_samples(data_dir):
all_data_path = []
for cls in os.listdir(data_dir):
cls_dir = os.path.join(data_dir, cls)
for data_path in os.listdir(cls_dir):
all_data_path.append(os.path.join(cls_dir, data_path))
return all_data_path
def generator_wrap(cloak_data, n_classes, test=False, validation_split=0.1):
if test:
all_data_path = select_samples(cloak_data.test_data_dir)
else:
all_data_path = select_samples(cloak_data.train_data_dir)
split = int(len(cloak_data.cloaked_protect_train_X) * (1 - validation_split))
cloaked_train_X = cloak_data.cloaked_protect_train_X[:split]
np.random.seed(12345)
# all_vals = list(cloak_data.path2idx.items())
while True:
batch_X = []
batch_Y = []
cur_batch_path = np.random.choice(all_data_path, args.batch_size)
for p in cur_batch_path:
# p = p.encode("utf-8").decode("ascii", 'ignore')
cur_y = cloak_data.path2idx[p]
# protect class and sybil class do not need to appear in test dataset
if test and (re.search(cloak_data.protect_class, p)):
continue
# protect class images in train dataset
elif p in cloak_data.protect_class_path:
cur_x = random.choice(cloaked_train_X)
else:
im = image.load_img(p, target_size=cloak_data.img_shape)
im = image.img_to_array(im)
cur_x = preprocess_input(im)
batch_X.append(cur_x)
batch_Y.append(cur_y)
batch_X = np.array(batch_X)
batch_Y = to_categorical(np.array(batch_Y), num_classes=n_classes)
yield batch_X, batch_Y
def eval_uncloaked_test_data(cloak_data, n_classes):
original_label = cloak_data.path2idx[list(cloak_data.protect_class_path)[0]]
protect_test_X = cloak_data.protect_test_X
original_Y = [original_label] * len(protect_test_X)
original_Y = to_categorical(original_Y, n_classes)
return protect_test_X, original_Y
def eval_cloaked_test_data(cloak_data, n_classes, validation_split=0.1):
split = int(len(cloak_data.cloaked_protect_train_X) * (1 - validation_split))
cloaked_test_X = cloak_data.cloaked_protect_train_X[split:]
original_label = cloak_data.path2idx[list(cloak_data.protect_class_path)[0]]
original_Y = [original_label] * len(cloaked_test_X)
original_Y = to_categorical(original_Y, n_classes)
return cloaked_test_X, original_Y
def main():
init_gpu(args.gpu)
if args.dataset == 'pubfig':
N_CLASSES = 65
CLOAK_DIR = args.cloak_data
elif args.dataset == 'scrub':
N_CLASSES = 530
CLOAK_DIR = args.cloak_data
else:
raise ValueError
CLOAK_DIR = os.path.join("../results", CLOAK_DIR)
RES = pickle.load(open(os.path.join(CLOAK_DIR, "cloak_data.p"), 'rb'))
print("Build attacker's model")
cloak_data = RES['cloak_data']
EVAL_RES = {}
train_generator = generator_wrap(cloak_data, n_classes=N_CLASSES,
validation_split=args.validation_split)
test_generator = generator_wrap(cloak_data, test=True, n_classes=N_CLASSES,
validation_split=args.validation_split)
EVAL_RES['transfer_model'] = args.transfer_model
base_model = load_extractor(args.transfer_model)
model = load_victim_model(teacher_model=base_model, number_classes=N_CLASSES)
original_X, original_Y = eval_uncloaked_test_data(cloak_data, N_CLASSES)
cloaked_test_X, cloaked_test_Y = eval_cloaked_test_data(cloak_data, N_CLASSES,
validation_split=args.validation_split)
try:
model.fit_generator(train_generator, steps_per_epoch=cloak_data.number_samples // 32,
validation_data=(original_X, original_Y), epochs=args.n_epochs, verbose=2,
use_multiprocessing=False, workers=1)
except KeyboardInterrupt:
pass
_, acc_original = model.evaluate(original_X, original_Y, verbose=0)
print("Accuracy on uncloaked/original images TEST: {:.4f}".format(acc_original))
EVAL_RES['acc_original'] = acc_original
_, other_acc = model.evaluate_generator(test_generator, verbose=0, steps=50)
print("Accuracy on other classes {:.4f}".format(other_acc))
EVAL_RES['other_acc'] = other_acc
dump_dictionary_as_json(EVAL_RES, os.path.join(CLOAK_DIR, "eval_seed{}.json".format(args.seed_idx)))
def parse_arguments(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', type=str,
help='GPU id', default='0')
parser.add_argument('--dataset', type=str,
help='name of dataset', default='scrub')
parser.add_argument('--cloak_data', type=str,
help='name of the cloak result directory',
default='scrub_webface_dense_robust_extract_protectPatrick_Dempsey')
parser.add_argument('--transfer_model', type=str,
help='the feature extractor used for tracker model training. It can be the same or not same as the user\'s', default='vggface2_inception_extract')
parser.add_argument('--batch_size', type=int, default=32)
parser.add_argument('--validation_split', type=float, default=0.1)
parser.add_argument('--n_epochs', type=int, default=5)
return parser.parse_args(argv)
if __name__ == '__main__':
args = parse_arguments(sys.argv[1:])
main()
+64
View File
@@ -0,0 +1,64 @@
import argparse
import os
import pickle
import random
import sys
import numpy as np
from keras.applications.vgg16 import preprocess_input
from keras.preprocessing import image
from utils import load_extractor, get_dataset_path
def load_sample_dir(path, sample=10):
x_ls = []
image_paths = list(os.listdir(path))
random.shuffle(image_paths)
for i, file in enumerate(image_paths):
if i > sample:
break
cur_path = os.path.join(path, file)
im = image.load_img(cur_path, target_size=(224, 224))
im = image.img_to_array(im)
x_ls.append(im)
raw_x = np.array(x_ls)
return preprocess_input(raw_x)
def normalize(x):
return x / np.linalg.norm(x)
def main():
extractor = load_extractor(args.feature_extractor)
path2emb = {}
for target_dataset in args.candidate_datasets:
target_dataset_path, _, _, _ = get_dataset_path(target_dataset)
for target_class in os.listdir(target_dataset_path):
target_class_path = os.path.join(target_dataset_path, target_class)
target_X = load_sample_dir(target_class_path)
cur_feature = extractor.predict(target_X)
cur_feature = np.mean(cur_feature, axis=0)
path2emb[target_class_path] = cur_feature
for k, v in path2emb.items():
path2emb[k] = normalize(v)
pickle.dump(path2emb, open("../feature_extractors/embeddings/{}_emb_norm.p".format(args.feature_extractor), "wb"))
def parse_arguments(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', type=str,
help='GPU id', default='0')
parser.add_argument('--candidate-datasets', nargs='+',
help='path candidate datasets')
parser.add_argument('--feature-extractor', type=str,
help="name of the feature extractor used for optimization",
default="webface_dense_robust_extract")
return parser.parse_args(argv)
if __name__ == '__main__':
args = parse_arguments(sys.argv[1:])
main()
+95
View File
@@ -0,0 +1,95 @@
import argparse
import os
import pickle
import random
import sys
import numpy as np
from differentiator import FawkesMaskGeneration
from tensorflow import set_random_seed
from utils import load_extractor, CloakData, init_gpu
random.seed(12243)
np.random.seed(122412)
set_random_seed(12242)
NUM_IMG_PROTECTED = 32 # Number of images used to optimize the target class
BATCH_SIZE = 32
MAX_ITER = 1000
def diff_protected_data(sess, feature_extractors_ls, image_X, number_protect, target_X=None, th=0.01):
image_X = image_X[:number_protect]
differentiator = FawkesMaskGeneration(sess, feature_extractors_ls,
batch_size=BATCH_SIZE,
mimic_img=True,
intensity_range='imagenet',
initial_const=args.sd,
learning_rate=args.lr,
max_iterations=MAX_ITER,
l_threshold=th,
verbose=1, maximize=False, keep_final=False, image_shape=image_X.shape[1:])
if len(target_X) < len(image_X):
target_X = np.concatenate([target_X, target_X, target_X])
target_X = target_X[:len(image_X)]
cloaked_image_X = differentiator.attack(image_X, target_X)
return cloaked_image_X
def perform_defense():
RES = {}
sess = init_gpu(args.gpu)
FEATURE_EXTRACTORS = [args.feature_extractor]
RES_DIR = '../results/'
RES['num_img_protected'] = NUM_IMG_PROTECTED
RES['extractors'] = FEATURE_EXTRACTORS
num_protect = NUM_IMG_PROTECTED
print("Loading {} for optimization".format(args.feature_extractor))
feature_extractors_ls = [load_extractor(name) for name in FEATURE_EXTRACTORS]
protect_class = args.protect_class
cloak_data = CloakData(args.dataset, protect_class=protect_class)
RES_FILE_NAME = "{}_{}_protect{}".format(args.dataset, args.feature_extractor, cloak_data.protect_class)
RES_FILE_NAME = os.path.join(RES_DIR, RES_FILE_NAME)
print("Protect Class: ", cloak_data.protect_class)
cloak_data.target_path, cloak_data.target_data = cloak_data.select_target_label(feature_extractors_ls,
FEATURE_EXTRACTORS)
os.makedirs(RES_DIR, exist_ok=True)
os.makedirs(RES_FILE_NAME, exist_ok=True)
cloak_image_X = diff_protected_data(sess, feature_extractors_ls, cloak_data.protect_train_X,
number_protect=num_protect,
target_X=cloak_data.target_data, th=args.th)
cloak_data.cloaked_protect_train_X = cloak_image_X
RES['cloak_data'] = cloak_data
pickle.dump(RES, open(os.path.join(RES_FILE_NAME, 'cloak_data.p'), "wb"))
def parse_arguments(argv):
parser = argparse.ArgumentParser()
parser.add_argument('--gpu', type=str,
help='GPU id', default='0')
parser.add_argument('--dataset', type=str,
help='name of dataset', default='scrub')
parser.add_argument('--feature-extractor', type=str,
help="name of the feature extractor used for optimization",
default="webface_dense_robust_extract")
parser.add_argument('--th', type=float, default=0.007)
parser.add_argument('--sd', type=int, default=1e5)
parser.add_argument('--protect_class', type=str, default=None)
parser.add_argument('--lr', type=float, default=0.1)
return parser.parse_args(argv)
if __name__ == '__main__':
args = parse_arguments(sys.argv[1:])
perform_defense()
+372
View File
@@ -0,0 +1,372 @@
import json
import os
import pickle
import random
import keras
import keras.backend as K
import numpy as np
import tensorflow as tf
from keras.applications.vgg16 import preprocess_input
from keras.layers import Dense, Activation
from keras.models import Model
from keras.preprocessing import image
from keras.utils import to_categorical
from sklearn.metrics import pairwise_distances
def clip_img(X, preprocessing='raw'):
X = reverse_preprocess(X, preprocessing)
X = np.clip(X, 0.0, 255.0)
X = preprocess(X, preprocessing)
return X
def dump_dictionary_as_json(dict, outfile):
j = json.dumps(dict)
with open(outfile, "wb") as f:
f.write(j.encode())
def fix_gpu_memory(mem_fraction=1):
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_fraction)
tf_config = tf.ConfigProto(gpu_options=gpu_options)
tf_config.gpu_options.allow_growth = True
tf_config.log_device_placement = False
init_op = tf.global_variables_initializer()
sess = tf.Session(config=tf_config)
sess.run(init_op)
K.set_session(sess)
return sess
def load_victim_model(number_classes, teacher_model=None, end2end=False):
for l in teacher_model.layers:
l.trainable = end2end
x = teacher_model.layers[-1].output
x = Dense(number_classes)(x)
x = Activation('softmax', name="act")(x)
model = Model(teacher_model.input, x)
opt = keras.optimizers.Adadelta()
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
return model
def init_gpu(gpu_index, force=False):
if isinstance(gpu_index, list):
gpu_num = ','.join([str(i) for i in gpu_index])
else:
gpu_num = str(gpu_index)
if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] and not force:
print('GPU already initiated')
return
os.environ["CUDA_VISIBLE_DEVICES"] = gpu_num
sess = fix_gpu_memory()
return sess
def preprocess(X, method):
assert method in {'raw', 'imagenet', 'inception', 'mnist'}
if method is 'raw':
pass
elif method is 'imagenet':
X = imagenet_preprocessing(X)
else:
raise Exception('unknown method %s' % method)
return X
def reverse_preprocess(X, method):
assert method in {'raw', 'imagenet', 'inception', 'mnist'}
if method is 'raw':
pass
elif method is 'imagenet':
X = imagenet_reverse_preprocessing(X)
else:
raise Exception('unknown method %s' % method)
return X
def imagenet_preprocessing(x, data_format=None):
if data_format is None:
data_format = K.image_data_format()
assert data_format in ('channels_last', 'channels_first')
x = np.array(x)
if data_format == 'channels_first':
# 'RGB'->'BGR'
if x.ndim == 3:
x = x[::-1, ...]
else:
x = x[:, ::-1, ...]
else:
# 'RGB'->'BGR'
x = x[..., ::-1]
mean = [103.939, 116.779, 123.68]
std = None
# Zero-center by mean pixel
if data_format == 'channels_first':
if x.ndim == 3:
x[0, :, :] -= mean[0]
x[1, :, :] -= mean[1]
x[2, :, :] -= mean[2]
if std is not None:
x[0, :, :] /= std[0]
x[1, :, :] /= std[1]
x[2, :, :] /= std[2]
else:
x[:, 0, :, :] -= mean[0]
x[:, 1, :, :] -= mean[1]
x[:, 2, :, :] -= mean[2]
if std is not None:
x[:, 0, :, :] /= std[0]
x[:, 1, :, :] /= std[1]
x[:, 2, :, :] /= std[2]
else:
x[..., 0] -= mean[0]
x[..., 1] -= mean[1]
x[..., 2] -= mean[2]
if std is not None:
x[..., 0] /= std[0]
x[..., 1] /= std[1]
x[..., 2] /= std[2]
return x
def imagenet_reverse_preprocessing(x, data_format=None):
import keras.backend as K
x = np.array(x)
if data_format is None:
data_format = K.image_data_format()
assert data_format in ('channels_last', 'channels_first')
if data_format == 'channels_first':
if x.ndim == 3:
# Zero-center by mean pixel
x[0, :, :] += 103.939
x[1, :, :] += 116.779
x[2, :, :] += 123.68
# 'BGR'->'RGB'
x = x[::-1, :, :]
else:
x[:, 0, :, :] += 103.939
x[:, 1, :, :] += 116.779
x[:, 2, :, :] += 123.68
x = x[:, ::-1, :, :]
else:
# Zero-center by mean pixel
x[..., 0] += 103.939
x[..., 1] += 116.779
x[..., 2] += 123.68
# 'BGR'->'RGB'
x = x[..., ::-1]
return x
def build_bottleneck_model(model, cut_off):
bottleneck_model = Model(model.input, model.get_layer(cut_off).output)
bottleneck_model.compile(loss='categorical_crossentropy',
optimizer='adam',
metrics=['accuracy'])
return bottleneck_model
def load_extractor(name):
model = keras.models.load_model("../feature_extractors/{}.h5".format(name))
if hasattr(model.layers[-1], "activation") and model.layers[-1].activation == "softmax":
raise Exception(
"Given extractor's last layer is softmax, need to remove the top layers to make it into a feature extractor")
# if "extract" in name.split("/")[-1]:
# pass
# else:
# print("Convert a model to a feature extractor")
# model = build_bottleneck_model(model, model.layers[layer_idx].name)
# model.save(name + "extract")
# model = keras.models.load_model(name + "extract")
return model
def get_dataset_path(dataset):
if not os.path.exists("config.json"):
raise Exception("Please config the datasets before running protection code. See more in README and config.py.")
config = json.load(open("config.json", 'r'))
if dataset not in config:
raise Exception(
"Dataset {} does not exist, please download to data/ and add the path to this function... Abort".format(
dataset))
return config[dataset]['train_dir'], config[dataset]['test_dir'], config[dataset]['num_classes'], config[dataset][
'num_images']
def normalize(x):
return x / np.linalg.norm(x, axis=1, keepdims=True)
class CloakData(object):
def __init__(self, dataset, img_shape=(224, 224), protect_class=None):
self.dataset = dataset
self.img_shape = img_shape
self.train_data_dir, self.test_data_dir, self.number_classes, self.number_samples = get_dataset_path(dataset)
self.all_labels = sorted(list(os.listdir(self.train_data_dir)))
if protect_class:
self.protect_class = protect_class
else:
self.protect_class = random.choice(self.all_labels)
self.sybil_class = random.choice([l for l in self.all_labels if l != self.protect_class])
self.protect_train_X, self.protect_test_X = self.load_label_data(self.protect_class)
self.sybil_train_X, self.sybil_test_X = self.load_label_data(self.sybil_class)
self.cloaked_protect_train_X = None
self.cloaked_sybil_train_X = None
self.label2path_train, self.label2path_test, self.path2idx = self.build_data_mapping()
self.all_training_path = self.get_all_data_path(self.label2path_train)
self.all_test_path = self.get_all_data_path(self.label2path_test)
self.protect_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.protect_class))
self.sybil_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.sybil_class))
print("Find {} protect images".format(len(self.protect_class_path)))
def get_class_image_files(self, path):
return [os.path.join(path, f) for f in os.listdir(path)]
def extractor_ls_predict(self, feature_extractors_ls, X):
feature_ls = []
for extractor in feature_extractors_ls:
cur_features = extractor.predict(X)
feature_ls.append(cur_features)
concated_feature_ls = np.concatenate(feature_ls, axis=1)
concated_feature_ls = normalize(concated_feature_ls)
return concated_feature_ls
def load_embeddings(self, feature_extractors_names):
dictionaries = []
for extractor_name in feature_extractors_names:
path2emb = pickle.load(open("../feature_extractors/embeddings/{}_emb_norm.p".format(extractor_name), "rb"))
dictionaries.append(path2emb)
merge_dict = {}
for k in dictionaries[0].keys():
cur_emb = [dic[k] for dic in dictionaries]
merge_dict[k] = np.concatenate(cur_emb)
return merge_dict
def select_target_label(self, feature_extractors_ls, feature_extractors_names, metric='l2'):
original_feature_x = self.extractor_ls_predict(feature_extractors_ls, self.protect_train_X)
path2emb = self.load_embeddings(feature_extractors_names)
items = list(path2emb.items())
paths = [p[0] for p in items]
embs = [p[1] for p in items]
embs = np.array(embs)
pair_dist = pairwise_distances(original_feature_x, embs, metric)
max_sum = np.min(pair_dist, axis=0)
sorted_idx = np.argsort(max_sum)[::-1]
highest_num = 0
paired_target_X = None
final_target_class_path = None
for idx in sorted_idx[:5]:
target_class_path = paths[idx]
cur_target_X = self.load_dir(target_class_path)
cur_target_X = np.concatenate([cur_target_X, cur_target_X, cur_target_X])
cur_tot_sum, cur_paired_target_X = self.calculate_dist_score(self.protect_train_X, cur_target_X,
feature_extractors_ls,
metric=metric)
if cur_tot_sum > highest_num:
highest_num = cur_tot_sum
paired_target_X = cur_paired_target_X
final_target_class_path = target_class_path
np.random.shuffle(paired_target_X)
return final_target_class_path, paired_target_X
def calculate_dist_score(self, a, b, feature_extractors_ls, metric='l2'):
features1 = self.extractor_ls_predict(feature_extractors_ls, a)
features2 = self.extractor_ls_predict(feature_extractors_ls, b)
pair_cos = pairwise_distances(features1, features2, metric)
max_sum = np.min(pair_cos, axis=0)
max_sum_arg = np.argsort(max_sum)[::-1]
max_sum_arg = max_sum_arg[:len(a)]
max_sum = [max_sum[i] for i in max_sum_arg]
paired_target_X = [b[j] for j in max_sum_arg]
paired_target_X = np.array(paired_target_X)
return np.min(max_sum), paired_target_X
def get_all_data_path(self, label2path):
all_paths = []
for k, v in label2path.items():
cur_all_paths = [os.path.join(k, cur_p) for cur_p in v]
all_paths.extend(cur_all_paths)
return all_paths
def load_label_data(self, label):
train_label_path = os.path.join(self.train_data_dir, label)
test_label_path = os.path.join(self.test_data_dir, label)
train_X = self.load_dir(train_label_path)
test_X = self.load_dir(test_label_path)
return train_X, test_X
def load_dir(self, path):
assert os.path.exists(path)
x_ls = []
for file in os.listdir(path):
cur_path = os.path.join(path, file)
im = image.load_img(cur_path, target_size=self.img_shape)
im = image.img_to_array(im)
x_ls.append(im)
raw_x = np.array(x_ls)
return preprocess_input(raw_x)
def build_data_mapping(self):
label2path_train = {}
label2path_test = {}
idx = 0
path2idx = {}
for label_name in self.all_labels:
full_path_train = os.path.join(self.train_data_dir, label_name)
full_path_test = os.path.join(self.test_data_dir, label_name)
label2path_train[full_path_train] = list(os.listdir(full_path_train))
label2path_test[full_path_test] = list(os.listdir(full_path_test))
for img_file in os.listdir(full_path_train):
path2idx[os.path.join(full_path_train, img_file)] = idx
for img_file in os.listdir(full_path_test):
path2idx[os.path.join(full_path_test, img_file)] = idx
idx += 1
return label2path_train, label2path_test, path2idx
def generate_data_post_cloak(self, sybil=False):
assert self.cloaked_protect_train_X is not None
while True:
batch_X = []
batch_Y = []
cur_batch_path = random.sample(self.all_training_path, 32)
for p in cur_batch_path:
cur_y = self.path2idx[p]
if p in self.protect_class_path:
cur_x = random.choice(self.cloaked_protect_train_X)
elif sybil and (p in self.sybil_class):
cur_x = random.choice(self.cloaked_sybil_train_X)
else:
im = image.load_img(p, target_size=self.img_shape)
im = image.img_to_array(im)
cur_x = preprocess_input(im)
batch_X.append(cur_x)
batch_Y.append(cur_y)
batch_X = np.array(batch_X)
batch_Y = to_categorical(np.array(batch_Y), num_classes=self.number_classes)
yield batch_X, batch_Y