From d6edd3021df3bd0d63251195c27f0accbbc7e1f9 Mon Sep 17 00:00:00 2001 From: Shawn-Shan Date: Sun, 17 May 2020 19:18:41 -0500 Subject: [PATCH] init commit --- fawkes/differentiator.py | 430 +++++++++++++++++++++++++++ fawkes/encode_utils.py | 617 +++++++++++++++++++++++++++++++++++++++ fawkes/eval_cloak.py | 202 +++++++++++++ fawkes/protection.py | 138 +++++++++ fawkes/utils.py | 410 ++++++++++++++++++++++++++ 5 files changed, 1797 insertions(+) create mode 100644 fawkes/differentiator.py create mode 100644 fawkes/encode_utils.py create mode 100644 fawkes/eval_cloak.py create mode 100644 fawkes/protection.py create mode 100644 fawkes/utils.py diff --git a/fawkes/differentiator.py b/fawkes/differentiator.py new file mode 100644 index 0000000..88e6c5c --- /dev/null +++ b/fawkes/differentiator.py @@ -0,0 +1,430 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# @Date : 2020-05-17 +# @Author : Shawn Shan (shansixiong@cs.uchicago.edu) +# @Link : https://www.shawnshan.com/ + +import datetime +import time +from decimal import Decimal +import numpy as np +import tensorflow as tf +from .utils import preprocess, reverse_preprocess + + +class FawkesMaskGeneration: + # if the attack is trying to mimic a target image or a neuron vector + MIMIC_IMG = True + # number of iterations to perform gradient descent + MAX_ITERATIONS = 10000 + # larger values converge faster to less accurate results + LEARNING_RATE = 1e-2 + # the initial constant c to pick as a first guess + INITIAL_CONST = 1 + # pixel intensity range + INTENSITY_RANGE = 'imagenet' + # threshold for distance + L_THRESHOLD = 0.03 + # whether keep the final result or the best result + KEEP_FINAL = False + # max_val of image + MAX_VAL = 255 + # The following variables are used by DSSIM, should keep as default + # filter size in SSIM + FILTER_SIZE = 11 + # filter sigma in SSIM + FILTER_SIGMA = 1.5 + # weights used in MS-SSIM + SCALE_WEIGHTS = None + MAXIMIZE = False + IMAGE_SHAPE = (224, 224, 3) + RATIO = 1.0 + LIMIT_DIST = False + + def __init__(self, sess, bottleneck_model_ls, mimic_img=MIMIC_IMG, + batch_size=1, learning_rate=LEARNING_RATE, + max_iterations=MAX_ITERATIONS, initial_const=INITIAL_CONST, + intensity_range=INTENSITY_RANGE, l_threshold=L_THRESHOLD, + max_val=MAX_VAL, keep_final=KEEP_FINAL, maximize=MAXIMIZE, image_shape=IMAGE_SHAPE, + verbose=0, ratio=RATIO, limit_dist=LIMIT_DIST): + + assert intensity_range in {'raw', 'imagenet', 'inception', 'mnist'} + + # constant used for tanh transformation to avoid corner cases + self.tanh_constant = 2 - 1e-6 + self.sess = sess + self.MIMIC_IMG = mimic_img + self.LEARNING_RATE = learning_rate + self.MAX_ITERATIONS = max_iterations + self.initial_const = initial_const + self.batch_size = batch_size + self.intensity_range = intensity_range + self.l_threshold = l_threshold + self.max_val = max_val + self.keep_final = keep_final + self.verbose = verbose + self.maximize = maximize + self.learning_rate = learning_rate + self.ratio = ratio + self.limit_dist = limit_dist + self.single_shape = list(image_shape) + + self.input_shape = tuple([self.batch_size] + self.single_shape) + + self.bottleneck_shape = tuple([self.batch_size] + self.single_shape) + + # the variable we're going to optimize over + self.modifier = tf.Variable(np.zeros(self.input_shape, dtype=np.float32)) + + # target image in tanh space + if self.MIMIC_IMG: + self.timg_tanh = tf.Variable(np.zeros(self.input_shape), dtype=np.float32) + else: + self.bottleneck_t_raw = tf.Variable(np.zeros(self.bottleneck_shape), dtype=np.float32) + # 
source image in tanh space + self.simg_tanh = tf.Variable(np.zeros(self.input_shape), dtype=np.float32) + + self.const = tf.Variable(np.ones(batch_size), dtype=np.float32) + self.mask = tf.Variable(np.ones((batch_size), dtype=np.bool)) + self.weights = tf.Variable(np.ones(self.bottleneck_shape, + dtype=np.float32)) + + # and here's what we use to assign them + self.assign_modifier = tf.placeholder(tf.float32, self.input_shape) + if self.MIMIC_IMG: + self.assign_timg_tanh = tf.placeholder( + tf.float32, self.input_shape) + else: + self.assign_bottleneck_t_raw = tf.placeholder( + tf.float32, self.bottleneck_shape) + self.assign_simg_tanh = tf.placeholder(tf.float32, self.input_shape) + self.assign_const = tf.placeholder(tf.float32, (batch_size)) + self.assign_mask = tf.placeholder(tf.bool, (batch_size)) + self.assign_weights = tf.placeholder(tf.float32, self.bottleneck_shape) + + # the resulting image, tanh'd to keep bounded from -0.5 to 0.5 + # adversarial image in raw space + self.aimg_raw = (tf.tanh(self.modifier + self.simg_tanh) / + self.tanh_constant + + 0.5) * 255.0 + # source image in raw space + self.simg_raw = (tf.tanh(self.simg_tanh) / + self.tanh_constant + + 0.5) * 255.0 + if self.MIMIC_IMG: + # target image in raw space + self.timg_raw = (tf.tanh(self.timg_tanh) / + self.tanh_constant + + 0.5) * 255.0 + + # convert source and adversarial image into input space + if self.intensity_range == 'imagenet': + mean = tf.constant(np.repeat([[[[103.939, 116.779, 123.68]]]], self.batch_size, axis=0), dtype=tf.float32, + name='img_mean') + self.aimg_input = (self.aimg_raw[..., ::-1] - mean) + self.simg_input = (self.simg_raw[..., ::-1] - mean) + if self.MIMIC_IMG: + self.timg_input = (self.timg_raw[..., ::-1] - mean) + + elif self.intensity_range == 'raw': + self.aimg_input = self.aimg_raw + self.simg_input = self.simg_raw + if self.MIMIC_IMG: + self.timg_input = self.timg_raw + + def batch_gen_DSSIM(aimg_raw_split, simg_raw_split): + msssim_split = tf.image.ssim(aimg_raw_split, simg_raw_split, max_val=255.0) + dist = (1.0 - tf.stack(msssim_split)) / 2.0 + return dist + + # raw value of DSSIM distance + self.dist_raw = batch_gen_DSSIM(self.aimg_raw, self.simg_raw) + # distance value after applying threshold + self.dist = tf.maximum(self.dist_raw - self.l_threshold, 0.0) + + self.dist_raw_sum = tf.reduce_sum( + tf.where(self.mask, + self.dist_raw, + tf.zeros_like(self.dist_raw))) + self.dist_sum = tf.reduce_sum(tf.where(self.mask, self.dist, tf.zeros_like(self.dist))) + + def resize_tensor(input_tensor, model_input_shape): + if input_tensor.shape[1:] == model_input_shape or model_input_shape[1] is None: + return input_tensor + resized_tensor = tf.image.resize(input_tensor, model_input_shape[:2]) + return resized_tensor + + def calculate_direction(bottleneck_model, cur_timg_input, cur_simg_input): + target_features = bottleneck_model(cur_timg_input) + return target_features + # target_center = tf.reduce_mean(target_features, axis=0) + # original = bottleneck_model(cur_simg_input) + # original_center = tf.reduce_mean(original, axis=0) + # direction = target_center - original_center + # final_target = original + self.ratio * direction + # return final_target + + self.bottlesim = 0.0 + self.bottlesim_sum = 0.0 + self.bottlesim_push = 0.0 + for bottleneck_model in bottleneck_model_ls: + model_input_shape = bottleneck_model.input_shape[1:] + cur_aimg_input = resize_tensor(self.aimg_input, model_input_shape) + + self.bottleneck_a = bottleneck_model(cur_aimg_input) + if self.MIMIC_IMG: + # 
cur_timg_input = resize_tensor(self.timg_input, model_input_shape) + # cur_simg_input = resize_tensor(self.simg_input, model_input_shape) + cur_timg_input = self.timg_input + cur_simg_input = self.simg_input + self.bottleneck_t = calculate_direction(bottleneck_model, cur_timg_input, cur_simg_input) + # self.bottleneck_t = bottleneck_model(cur_timg_input) + else: + self.bottleneck_t = self.bottleneck_t_raw + + bottleneck_diff = self.bottleneck_t - self.bottleneck_a + scale_factor = tf.sqrt(tf.reduce_sum(tf.square(self.bottleneck_t), axis=1)) + + cur_bottlesim = tf.sqrt(tf.reduce_sum(tf.square(bottleneck_diff), axis=1)) + cur_bottlesim = cur_bottlesim / scale_factor + cur_bottlesim_sum = tf.reduce_sum(cur_bottlesim) + + self.bottlesim += cur_bottlesim + + # self.bottlesim_push += cur_bottlesim_push_sum + self.bottlesim_sum += cur_bottlesim_sum + + # sum up the losses + if self.maximize: + self.loss = self.const * tf.square(self.dist) - self.bottlesim + else: + self.loss = self.const * tf.square(self.dist) + self.bottlesim + + self.loss_sum = tf.reduce_sum(tf.where(self.mask, self.loss, tf.zeros_like(self.loss))) + + # Setup the Adadelta optimizer and keep track of variables + # we're creating + start_vars = set(x.name for x in tf.global_variables()) + self.learning_rate_holder = tf.placeholder(tf.float32, shape=[]) + # optimizer = tf.train.AdadeltaOptimizer(self.learning_rate_holder) + optimizer = tf.train.AdamOptimizer(self.learning_rate_holder) + + self.train = optimizer.minimize(self.loss_sum, + var_list=[self.modifier]) + end_vars = tf.global_variables() + new_vars = [x for x in end_vars if x.name not in start_vars] + + # these are the variables to initialize when we run + self.setup = [] + self.setup.append(self.modifier.assign(self.assign_modifier)) + if self.MIMIC_IMG: + self.setup.append(self.timg_tanh.assign(self.assign_timg_tanh)) + else: + self.setup.append(self.bottleneck_t_raw.assign( + self.assign_bottleneck_t_raw)) + self.setup.append(self.simg_tanh.assign(self.assign_simg_tanh)) + self.setup.append(self.const.assign(self.assign_const)) + self.setup.append(self.mask.assign(self.assign_mask)) + self.setup.append(self.weights.assign(self.assign_weights)) + + self.init = tf.variables_initializer(var_list=[self.modifier] + new_vars) + + print('Attacker loaded') + + def preprocess_arctanh(self, imgs): + + imgs = reverse_preprocess(imgs, self.intensity_range) + imgs /= 255.0 + imgs -= 0.5 + imgs *= self.tanh_constant + tanh_imgs = np.arctanh(imgs) + + return tanh_imgs + + def clipping(self, imgs): + + imgs = reverse_preprocess(imgs, self.intensity_range) + imgs = np.clip(imgs, 0, self.max_val) + imgs = np.rint(imgs) + + imgs = preprocess(imgs, self.intensity_range) + + return imgs + + def attack(self, source_imgs, target_imgs, weights=None): + + if weights is None: + weights = np.ones([source_imgs.shape[0]] + + list(self.bottleneck_shape[1:])) + + assert weights.shape[1:] == self.bottleneck_shape[1:] + assert source_imgs.shape[1:] == self.input_shape[1:] + assert source_imgs.shape[0] == weights.shape[0] + if self.MIMIC_IMG: + assert target_imgs.shape[1:] == self.input_shape[1:] + assert source_imgs.shape[0] == target_imgs.shape[0] + else: + assert target_imgs.shape[1:] == self.bottleneck_shape[1:] + assert source_imgs.shape[0] == target_imgs.shape[0] + + start_time = time.time() + + adv_imgs = [] + print('%d batches in total' + % int(np.ceil(len(source_imgs) / self.batch_size))) + + for idx in range(0, len(source_imgs), self.batch_size): + print('processing batch %d at %s' % (idx, 
datetime.datetime.now())) + adv_img = self.attack_batch(source_imgs[idx:idx + self.batch_size], + target_imgs[idx:idx + self.batch_size], + weights[idx:idx + self.batch_size]) + adv_imgs.extend(adv_img) + + elapsed_time = time.time() - start_time + print('attack cost %f s' % (elapsed_time)) + + return np.array(adv_imgs) + + def attack_batch(self, source_imgs, target_imgs, weights): + + """ + Run the attack on a batch of images and labels. + """ + + LR = self.learning_rate + nb_imgs = source_imgs.shape[0] + mask = [True] * nb_imgs + [False] * (self.batch_size - nb_imgs) + mask = np.array(mask, dtype=np.bool) + + source_imgs = np.array(source_imgs) + target_imgs = np.array(target_imgs) + + # convert to tanh-space + simg_tanh = self.preprocess_arctanh(source_imgs) + if self.MIMIC_IMG: + timg_tanh = self.preprocess_arctanh(target_imgs) + else: + timg_tanh = target_imgs + + CONST = np.ones(self.batch_size) * self.initial_const + + self.sess.run(self.init) + simg_tanh_batch = np.zeros(self.input_shape) + if self.MIMIC_IMG: + timg_tanh_batch = np.zeros(self.input_shape) + else: + timg_tanh_batch = np.zeros(self.bottleneck_shape) + weights_batch = np.zeros(self.bottleneck_shape) + simg_tanh_batch[:nb_imgs] = simg_tanh[:nb_imgs] + timg_tanh_batch[:nb_imgs] = timg_tanh[:nb_imgs] + weights_batch[:nb_imgs] = weights[:nb_imgs] + modifier_batch = np.ones(self.input_shape) * 1e-6 + + # set the variables so that we don't have to send them over again + if self.MIMIC_IMG: + self.sess.run(self.setup, + {self.assign_timg_tanh: timg_tanh_batch, + self.assign_simg_tanh: simg_tanh_batch, + self.assign_const: CONST, + self.assign_mask: mask, + self.assign_weights: weights_batch, + self.assign_modifier: modifier_batch}) + else: + # if directly mimicking a vector, use assign_bottleneck_t_raw + # in setup + self.sess.run(self.setup, + {self.assign_bottleneck_t_raw: timg_tanh_batch, + self.assign_simg_tanh: simg_tanh_batch, + self.assign_const: CONST, + self.assign_mask: mask, + self.assign_weights: weights_batch, + self.assign_modifier: modifier_batch}) + + best_bottlesim = [0] * nb_imgs if self.maximize else [np.inf] * nb_imgs + best_adv = np.zeros_like(source_imgs) + + if self.verbose == 1: + loss_sum = float(self.sess.run(self.loss_sum)) + dist_sum = float(self.sess.run(self.dist_sum)) + thresh_over = (dist_sum / self.batch_size / self.l_threshold * 100) + dist_raw_sum = float(self.sess.run(self.dist_raw_sum)) + bottlesim_sum = self.sess.run(self.bottlesim_sum) + print('START: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f' + % (Decimal(loss_sum), + dist_sum, + thresh_over, + dist_raw_sum, + bottlesim_sum / nb_imgs)) + + try: + total_distance = [0] * nb_imgs + + if self.limit_dist: + dist_raw_list, bottlesim_list, aimg_input_list = self.sess.run( + [self.dist_raw, + self.bottlesim, + self.aimg_input]) + for e, (dist_raw, bottlesim, aimg_input) in enumerate( + zip(dist_raw_list, bottlesim_list, aimg_input_list)): + if e >= nb_imgs: + break + total_distance[e] = bottlesim + + for iteration in range(self.MAX_ITERATIONS): + + self.sess.run([self.train], feed_dict={self.learning_rate_holder: LR}) + + dist_raw_list, bottlesim_list, aimg_input_list = self.sess.run( + [self.dist_raw, + self.bottlesim, + self.aimg_input]) + for e, (dist_raw, bottlesim, aimg_input) in enumerate( + zip(dist_raw_list, bottlesim_list, aimg_input_list)): + if e >= nb_imgs: + break + if (bottlesim < best_bottlesim[e] and bottlesim > total_distance[e] * 0.1 and ( + not self.maximize)) or ( + bottlesim > best_bottlesim[e] and 
self.maximize): + best_bottlesim[e] = bottlesim + best_adv[e] = aimg_input + + if iteration != 0 and iteration % (self.MAX_ITERATIONS // 3) == 0: + LR = LR / 2 + print("Learning Rate: ", LR) + # print out the losses every 10% + if iteration % (self.MAX_ITERATIONS // 10) == 0: + if self.verbose == 1: + loss_sum = float(self.sess.run(self.loss_sum)) + dist_sum = float(self.sess.run(self.dist_sum)) + thresh_over = (dist_sum / + self.batch_size / + self.l_threshold * + 100) + dist_raw_sum = float(self.sess.run(self.dist_raw_sum)) + bottlesim_sum = self.sess.run(self.bottlesim_sum) + print('ITER %4d: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f' + % (iteration, + Decimal(loss_sum), + dist_sum, + thresh_over, + dist_raw_sum, + bottlesim_sum / nb_imgs)) + except KeyboardInterrupt: + pass + + if self.verbose == 1: + loss_sum = float(self.sess.run(self.loss_sum)) + dist_sum = float(self.sess.run(self.dist_sum)) + thresh_over = (dist_sum / self.batch_size / self.l_threshold * 100) + dist_raw_sum = float(self.sess.run(self.dist_raw_sum)) + bottlesim_sum = float(self.sess.run(self.bottlesim_sum)) + print('END: Total loss: %.4E; perturb: %.6f (%.2f%% over, raw: %.6f); sim: %f' + % (Decimal(loss_sum), + dist_sum, + thresh_over, + dist_raw_sum, + bottlesim_sum / nb_imgs)) + + best_adv = self.clipping(best_adv[:nb_imgs]) + + return best_adv diff --git a/fawkes/encode_utils.py b/fawkes/encode_utils.py new file mode 100644 index 0000000..dcdb3a4 --- /dev/null +++ b/fawkes/encode_utils.py @@ -0,0 +1,617 @@ +import sys + +sys.path.append("/home/shansixioing/tools/") +import gen_utils +import keras, os +from keras.preprocessing import image +from skimage.transform import resize +from sklearn.model_selection import train_test_split +from keras.models import Model +from keras.layers import Input +from keras.layers import Conv2D, MaxPooling2D, Dense, Activation, Layer +import keras.backend as K +import random, pickle +import numpy as np +from keras.preprocessing.image import ImageDataGenerator +from keras.applications.vgg16 import preprocess_input +from sklearn.metrics import pairwise_distances +from keras.utils import to_categorical + + +def load_dataset_deepid(full=False, num_classes=1283, preprocess='raw'): + if not full: + X_train, Y_train = gen_utils.load_h5py(["X_train", "Y_train"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_training_0.h5") + else: + X_train_0, Y_train_0 = gen_utils.load_h5py(["X_train", "Y_train"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_training_0.h5") + + X_train_1, Y_train_1 = gen_utils.load_h5py(["X_train", "Y_train"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_training_1.h5") + + X_train_2, Y_train_2 = gen_utils.load_h5py(["X_train", "Y_train"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_training_2.h5") + + X_train_3, Y_train_3 = gen_utils.load_h5py(["X_train", "Y_train"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_training_3.h5") + + X_train = np.concatenate([X_train_0, X_train_1, X_train_2, X_train_3]) + Y_train = np.concatenate([Y_train_0, Y_train_1, Y_train_2, Y_train_3]) + + X_test, Y_test = gen_utils.load_h5py(["X_test", "Y_test"], + "/mnt/data/sixiongshan/backdoor/data/deepid/deepid_data_testing.h5") + + X_train = utils_keras.preprocess(X_train, preprocess) + X_test = utils_keras.preprocess(X_test, preprocess) + + return X_train, Y_train, X_test, Y_test + + +def load_dataset(data_file): + dataset = utils_keras.load_dataset(data_file) + + X_train = dataset['X_train'] + Y_train = 
dataset['Y_train'] + X_test = dataset['X_test'] + Y_test = dataset['Y_test'] + + return X_train, Y_train, X_test, Y_test + + +def load_extractor(name, all_layers=False): + if name is None: + return + m = keras.models.load_model("/home/shansixioing/cloak/models/extractors/{}_extract.h5".format(name)) + if all_layers: + if name == 'vggface1': + target_layers = ['conv4_3', 'conv5_1', 'conv5_2', 'conv5_3', 'flatten', 'fc6', 'fc7'] + extractor = Model(inputs=m.layers[0].input, + outputs=[m.get_layer(l).output for l in target_layers]) + + return m + + +def transfer_learning_model(teacher_model, number_classes): + for l in teacher_model.layers: + l.trainable = False + x = teacher_model.layers[-1].output + x = Dense(number_classes)(x) + x = Activation('softmax', name="act")(x) + model = Model(teacher_model.input, x) + + opt = keras.optimizers.Adadelta() + model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) + return model + + +def clip_img(X, preprocessing='raw'): + X = utils_keras.reverse_preprocess(X, preprocessing) + X = np.clip(X, 0.0, 255.0) + X = utils_keras.preprocess(X, preprocessing) + return X + + +def get_dataset_path(dataset): + if dataset == "webface": + train_data_dir = '/mnt/data/sixiongshan/data/webface/train' + test_data_dir = '/mnt/data/sixiongshan/data/webface/test' + number_classes = 10575 + number_samples = 475137 + + elif dataset == "vggface1": + train_data_dir = '/mnt/data/sixiongshan/data/vggface/train' + test_data_dir = '/mnt/data/sixiongshan/data/vggface/test' + number_classes = 2622 + number_samples = 1716436 // 3 + + elif dataset == "vggface2": + train_data_dir = '/mnt/data/sixiongshan/data/vggface2/train' + test_data_dir = '/mnt/data/sixiongshan/data/vggface2/test' + number_classes = 8631 + number_samples = 3141890 // 3 + + elif dataset == "scrub": + train_data_dir = '/mnt/data/sixiongshan/data/facescrub/keras_flow_dir/train' + test_data_dir = '/mnt/data/sixiongshan/data/facescrub/keras_flow_dir/test' + number_classes = 530 + number_samples = 57838 + + elif dataset == "youtubeface": + train_data_dir = '/mnt/data/sixiongshan/data/youtubeface/keras_flow_data/train_mtcnnpy_224' + test_data_dir = '/mnt/data/sixiongshan/data/youtubeface/keras_flow_data/test_mtcnnpy_224' + number_classes = 1283 + number_samples = 587137 // 5 + + elif dataset == "emily": + train_data_dir = '/mnt/data/sixiongshan/data/emface/train' + test_data_dir = '/mnt/data/sixiongshan/data/emface/test' + number_classes = 66 + number_samples = 6070 + + elif dataset == "pubfig": + train_data_dir = '/mnt/data/sixiongshan/data/pubfig/train' + test_data_dir = '/mnt/data/sixiongshan/data/pubfig/test' + number_classes = 65 + number_samples = 5979 + + elif dataset == "iris": + train_data_dir = '/mnt/data/sixiongshan/data/iris/train' + test_data_dir = '/mnt/data/sixiongshan/data/iris/test' + number_classes = 1000 + number_samples = 14000 + else: + print("Dataset {} does not exist... 
Abort".format(dataset)) + exit(1) + + return train_data_dir, test_data_dir, number_classes, number_samples + + +def large_dataset_loader(dataset, augmentation=False, test_only=False, image_size=(224, 224)): + train_data_dir, test_data_dir, number_classes, number_samples = get_dataset_path(dataset) + train_generator, test_generator = generator_wrap(train_data_dir=train_data_dir, test_data_dir=test_data_dir, + augmentation=augmentation, + test_only=test_only, image_size=image_size) + return train_generator, test_generator, number_classes, number_samples + + +def sample_from_generator(gen, nb_sample): + x_test, y_test = gen.next() + X_sample = np.zeros((0, x_test.shape[1], x_test.shape[2], x_test.shape[3])) + Y_sample = np.zeros((0, y_test.shape[1])) + + while X_sample.shape[0] < nb_sample: + x, y = gen.next() + X_sample = np.concatenate((X_sample, x), axis=0) + Y_sample = np.concatenate((Y_sample, y), axis=0) + + X_sample = X_sample[:nb_sample] + Y_sample = Y_sample[:nb_sample] + + return X_sample, Y_sample + + +def generator_wrap(train_data_dir=None, test_data_dir=None, augmentation=False, test_only=False, image_size=(224, 224)): + if not test_data_dir: + validation_split = 0.05 + else: + validation_split = 0 + if augmentation: + data_gen = ImageDataGenerator( + preprocessing_function=preprocess_input, + rotation_range=20, + width_shift_range=0.15, + height_shift_range=0.15, + shear_range=0., + zoom_range=0.15, + channel_shift_range=0., + fill_mode='nearest', + cval=0., + horizontal_flip=True, validation_split=validation_split) + else: + data_gen = ImageDataGenerator(preprocessing_function=preprocess_input, validation_split=validation_split) + + if test_data_dir is None: + train_generator = data_gen.flow_from_directory( + train_data_dir, + target_size=image_size, + batch_size=32, subset='training') + test_generator = data_gen.flow_from_directory( + train_data_dir, + target_size=image_size, + batch_size=32, subset='validation') + else: + if test_only: + train_generator = None + else: + train_generator = data_gen.flow_from_directory( + train_data_dir, + target_size=image_size, + batch_size=32) + test_generator = data_gen.flow_from_directory( + test_data_dir, + target_size=image_size, + batch_size=32) + + return train_generator, test_generator + + +class MergeLayer(Layer): + + def __init__(self, **kwargs): + self.result = None + super(MergeLayer, self).__init__(**kwargs) + + def build(self, input_shape): + # Create a trainable weight variable for this layer. 
+ kernel_1_shape = (5 * 4 * 60, 160) + kernel_2_shape = (4 * 3 * 80, 160) + bias_shape = (160,) + self.kernel_1 = self.add_weight(name='kernel_1', + shape=kernel_1_shape, + initializer='uniform', + trainable=True) + self.kernel_2 = self.add_weight(name='kernel_2', + shape=kernel_2_shape, + initializer='uniform', + trainable=True) + self.bias = self.add_weight(name='bias', + shape=bias_shape, + initializer='uniform', + trainable=True) + super(MergeLayer, self).build(input_shape) # Be sure to call this at the end + + def call(self, x): + layer1 = x[0] + layer2 = x[1] + layer1_r = K.reshape(layer1, (-1, 5 * 4 * 60)) + layer2_r = K.reshape(layer2, (-1, 4 * 3 * 80)) + self.result = K.dot(layer1_r, self.kernel_1) + \ + K.dot(layer2_r, self.kernel_2) + self.bias + return self.result + + def compute_output_shape(self, input_shape): + return K.int_shape(self.result) + + +def load_deepid_model(class_num): + input_shape = (55, 47, 3) + + img_input = Input(shape=input_shape) + h1 = Conv2D(20, (4, 4), strides=(1, 1), padding='valid', name='conv_1')(img_input) + h1 = Activation('relu')(h1) + h1 = MaxPooling2D((2, 2), strides=(2, 2), name='pool_1')(h1) + + h2 = Conv2D(40, (3, 3), strides=(1, 1), padding='valid', name='conv_2')(h1) + h2 = Activation('relu')(h2) + h2 = MaxPooling2D((2, 2), strides=(2, 2), name='pool_2')(h2) + + h3 = Conv2D(60, (3, 3), strides=(1, 1), padding='valid', name='conv_3')(h2) + h3 = Activation('relu')(h3) + h3 = MaxPooling2D((2, 2), strides=(2, 2), name='pool_3')(h3) + + h4 = Conv2D(80, (2, 2), strides=(1, 1), padding='valid', name='conv_4')(h3) + h4 = Activation('relu')(h4) + + h5 = MergeLayer()([h3, h4]) + h5 = Activation('relu')(h5) + + h5 = Dense(class_num, name='fc')(h5) + h5 = Activation('softmax')(h5) + + inputs = img_input + model = Model(inputs, h5, name='vgg_face') + return model + + +def get_label_data(X, Y, target): + X_filter = np.array(X) + Y_filter = np.array(Y) + remain_idx = np.argmax(Y, axis=1) == target + X_filter = X_filter[remain_idx] + Y_filter = Y_filter[remain_idx] + return X_filter, Y_filter + + +def get_other_label_data(X, Y, target): + X_filter = np.array(X) + Y_filter = np.array(Y) + remain_idx = np.argmax(Y, axis=1) != target + X_filter = X_filter[remain_idx] + Y_filter = Y_filter[remain_idx] + return X_filter, Y_filter + + +def get_labels_data(X, Y, target_ls): + assert isinstance(target_ls, list) + X_filter = np.array(X) + Y_filter = np.array(Y) + remain_idx = np.array([False] * len(Y_filter)) + for target in target_ls: + cur_remain_idx = np.argmax(Y, axis=1) == target + remain_idx = np.logical_or(remain_idx, cur_remain_idx) + + X_filter = X_filter[remain_idx] + Y_filter = Y_filter[remain_idx] + return X_filter, Y_filter + + +def get_other_labels_data_except(X, Y, target_ls): + assert isinstance(target_ls, list) + + X_filter = np.array(X) + Y_filter = np.array(Y) + remain_idx = np.array([True] * len(Y_filter)) + for target in target_ls: + cur_remain_idx = np.argmax(Y, axis=1) != target + remain_idx = np.logical_and(remain_idx, cur_remain_idx) + + X_filter = X_filter[remain_idx] + Y_filter = Y_filter[remain_idx] + return X_filter, Y_filter + + +def get_bottom_top_model(model, layer_name): + layer = model.get_layer(layer_name) + bottom_input = Input(model.input_shape[1:]) + bottom_output = bottom_input + top_input = Input(layer.output_shape[1:]) + top_output = top_input + + bottom = True + for layer in model.layers: + if bottom: + bottom_output = layer(bottom_output) + else: + top_output = layer(top_output) + if layer.name == layer_name: + bottom = 
False + + bottom_model = Model(bottom_input, bottom_output) + top_model = Model(top_input, top_output) + + return bottom_model, top_model + + +def load_end2end_model(arch, number_classes): + if arch == 'resnet': + MODEL = keras.applications.resnet_v2.ResNet152V2(include_top=False, weights='imagenet', pooling='avg', + input_shape=(224, 224, 3)) + elif arch == 'inception': + MODEL = keras.applications.InceptionResNetV2(include_top=False, weights='imagenet', pooling='avg', + input_shape=(224, 224, 3)) + elif arch == 'mobile': + MODEL = keras.applications.mobilenet_v2.MobileNetV2(include_top=False, weights='imagenet', pooling='avg', + input_shape=(224, 224, 3)) + elif arch == 'dense': + MODEL = keras.applications.densenet.DenseNet121(include_top=False, weights='imagenet', pooling='avg', + input_shape=(224, 224, 3)) + + model = load_victim_model(number_classes, MODEL, end2end=True) + return model + + +def load_victim_model(number_classes, teacher_model=None, end2end=False): + for l in teacher_model.layers: + l.trainable = end2end + x = teacher_model.layers[-1].output + + x = Dense(number_classes)(x) + x = Activation('softmax', name="act")(x) + model = Model(teacher_model.input, x) + opt = keras.optimizers.Adadelta() + model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) + return model + + +def add_last_layer(number_classes, teacher_model, cut_to_layer=None): + for l in teacher_model.layers: + l.trainable = False + + if cut_to_layer: + x = teacher_model.layers[cut_to_layer].output + print(teacher_model.layers[cut_to_layer].name) + else: + x = teacher_model.layers[-1].output + + x = Dense(number_classes, name='softmax')(x) + x = Activation('softmax', name="act")(x) + model = Model(teacher_model.input, x) + + opt = keras.optimizers.Adadelta() + model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) + return model + + +def resize_batch(x, target_size=(224, 224), intensity="imagenet"): + if x.shape[:2] == target_size: + return x + + x = utils_keras.reverse_preprocess(x, intensity) + resized = np.array([resize(a, target_size) for a in x]) + return utils_keras.preprocess(resized, intensity) + + +def build_bottleneck_model(model, cut_off): + bottleneck_model = Model(model.input, model.get_layer(cut_off).output) + bottleneck_model.compile(loss='categorical_crossentropy', + optimizer='adam', + metrics=['accuracy']) + + return bottleneck_model + + +def split_dataset(X, y, ratio=0.3): + x_appro, x_later, y_appro, y_later = train_test_split(X, y, test_size=ratio, random_state=0) + return x_appro, x_later, y_appro, y_later + + +def data_generator(X, Y, batch_size=32, target_size=(224, 224), intensity='imagenet'): + data_gen = ImageDataGenerator() + data_gen = data_gen.flow(X, Y, batch_size=batch_size) + while True: + cur_X, cur_Y = next(data_gen) + cur_X = resize_batch(cur_X, target_size=target_size, intensity=intensity) + yield np.array(cur_X), cur_Y + + +def evaluate(model, X_test, Y_test, batch_size=32, target_size=(224, 224)): + test_other_gen = data_generator(X_test, Y_test, batch_size=batch_size, target_size=target_size) + if len(X_test) < batch_size * 2: + batch_size = 1 + test_other_step = len(X_test) // batch_size // 2 + acc = model.evaluate_generator(test_other_gen, steps=test_other_step, verbose=0)[1] + return acc + + +def normalize(x): + return x / np.linalg.norm(x, axis=1, keepdims=True) + + +class CloakData(object): + def __init__(self, dataset, img_shape=(224, 224), target_selection_tries=30, protect_class=None): + self.dataset = 
dataset + self.img_shape = img_shape + self.target_selection_tries = target_selection_tries + + self.train_data_dir, self.test_data_dir, self.number_classes, self.number_samples = get_dataset_path(dataset) + self.all_labels = sorted(list(os.listdir(self.train_data_dir))) + if protect_class: + self.protect_class = protect_class + else: + self.protect_class = random.choice(self.all_labels) + + self.sybil_class = random.choice([l for l in self.all_labels if l != self.protect_class]) + print("Protect label: {} | Sybil label: {}".format(self.protect_class, self.sybil_class)) + self.protect_train_X, self.protect_test_X = self.load_label_data(self.protect_class) + self.sybil_train_X, self.sybil_test_X = self.load_label_data(self.sybil_class) + # self.target_path, self.target_data = self.select_target_label() + + self.cloaked_protect_train_X = None + self.cloaked_sybil_train_X = None + + self.label2path_train, self.label2path_test, self.path2idx = self.build_data_mapping() + self.all_training_path = self.get_all_data_path(self.label2path_train) + self.all_test_path = self.get_all_data_path(self.label2path_test) + self.protect_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.protect_class)) + self.sybil_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.sybil_class)) + + print( + "Find {} protect images | {} sybil images".format(len(self.protect_class_path), len(self.sybil_class_path))) + + def get_class_image_files(self, path): + return [os.path.join(path, f) for f in os.listdir(path)] + + def extractor_ls_predict(self, feature_extractors_ls, X): + feature_ls = [] + for extractor in feature_extractors_ls: + cur_features = extractor.predict(X) + feature_ls.append(cur_features) + concated_feature_ls = np.concatenate(feature_ls, axis=1) + concated_feature_ls = normalize(concated_feature_ls) + return concated_feature_ls + + def load_embeddings(self, feature_extractors_names): + dictionaries = [] + + for extractor_name in feature_extractors_names: + path2emb = pickle.load(open("/home/shansixioing/cloak/embs/{}_emb_norm.p".format(extractor_name), "rb")) + # path2emb = pickle.load(open("/home/shansixioing/cloak/embs/vggface2_inception_emb.p".format(extractor_name), "rb")) + dictionaries.append(path2emb) + merge_dict = {} + for k in dictionaries[0].keys(): + cur_emb = [dic[k] for dic in dictionaries] + merge_dict[k] = np.concatenate(cur_emb) + return merge_dict + + def select_target_label(self, feature_extractors_ls, feature_extractors_names, metric='l2'): + # original_feature_x = extractor.predict(self.protect_train_X) + original_feature_x = self.extractor_ls_predict(feature_extractors_ls, self.protect_train_X) + + path2emb = self.load_embeddings(feature_extractors_names) + # items = list(path2emb.items()) + teacher_dataset = feature_extractors_names[0].split("_")[0] + # items = [(k, v) for k, v in path2emb.items() if teacher_dataset in k] + items = list(path2emb.items()) + paths = [p[0] for p in items] + embs = [p[1] for p in items] + embs = np.array(embs) + + pair_dist = pairwise_distances(original_feature_x, embs, 'l2') + max_sum = np.min(pair_dist, axis=0) + sorted_idx = np.argsort(max_sum)[::-1] + + highest_num = 0 + paired_target_X = None + final_target_class_path = None + for idx in sorted_idx[:2]: + target_class_path = paths[idx] + cur_target_X = self.load_dir(target_class_path) + cur_target_X = np.concatenate([cur_target_X, cur_target_X, cur_target_X]) + + cur_tot_sum, cur_paired_target_X = self.calculate_dist_score(self.protect_train_X, 
cur_target_X, + feature_extractors_ls, + metric=metric) + if cur_tot_sum > highest_num: + highest_num = cur_tot_sum + paired_target_X = cur_paired_target_X + final_target_class_path = target_class_path + + np.random.shuffle(paired_target_X) + return final_target_class_path, paired_target_X + + def calculate_dist_score(self, a, b, feature_extractors_ls, metric='l2'): + features1 = self.extractor_ls_predict(feature_extractors_ls, a) + features2 = self.extractor_ls_predict(feature_extractors_ls, b) + + pair_cos = pairwise_distances(features1, features2, metric) + max_sum = np.min(pair_cos, axis=0) + max_sum_arg = np.argsort(max_sum)[::-1] + max_sum_arg = max_sum_arg[:len(a)] + max_sum = [max_sum[i] for i in max_sum_arg] + paired_target_X = [b[j] for j in max_sum_arg] + paired_target_X = np.array(paired_target_X) + return np.min(max_sum), paired_target_X + + def get_all_data_path(self, label2path): + all_paths = [] + for k, v in label2path.items(): + cur_all_paths = [os.path.join(k, cur_p) for cur_p in v] + all_paths.extend(cur_all_paths) + return all_paths + + def load_label_data(self, label): + train_label_path = os.path.join(self.train_data_dir, label) + test_label_path = os.path.join(self.test_data_dir, label) + train_X = self.load_dir(train_label_path) + test_X = self.load_dir(test_label_path) + return train_X, test_X + + def load_dir(self, path): + assert os.path.exists(path) + x_ls = [] + for file in os.listdir(path): + cur_path = os.path.join(path, file) + im = image.load_img(cur_path, target_size=self.img_shape) + im = image.img_to_array(im) + x_ls.append(im) + raw_x = np.array(x_ls) + return preprocess_input(raw_x) + + def build_data_mapping(self): + label2path_train = {} + label2path_test = {} + idx = 0 + path2idx = {} + for label_name in self.all_labels: + full_path_train = os.path.join(self.train_data_dir, label_name) + full_path_test = os.path.join(self.test_data_dir, label_name) + label2path_train[full_path_train] = list(os.listdir(full_path_train)) + label2path_test[full_path_test] = list(os.listdir(full_path_test)) + for img_file in os.listdir(full_path_train): + path2idx[os.path.join(full_path_train, img_file)] = idx + for img_file in os.listdir(full_path_test): + path2idx[os.path.join(full_path_test, img_file)] = idx + idx += 1 + return label2path_train, label2path_test, path2idx + + def generate_data_post_cloak(self, sybil=False): + assert self.cloaked_protect_train_X is not None + while True: + batch_X = [] + batch_Y = [] + cur_batch_path = random.sample(self.all_training_path, 32) + for p in cur_batch_path: + cur_y = self.path2idx[p] + if p in self.protect_class_path: + cur_x = random.choice(self.cloaked_protect_train_X) + elif sybil and (p in self.sybil_class): + cur_x = random.choice(self.cloaked_sybil_train_X) + else: + im = image.load_img(p, target_size=self.img_shape) + im = image.img_to_array(im) + cur_x = preprocess_input(im) + batch_X.append(cur_x) + batch_Y.append(cur_y) + batch_X = np.array(batch_X) + batch_Y = to_categorical(np.array(batch_Y), num_classes=self.number_classes) + yield batch_X, batch_Y diff --git a/fawkes/eval_cloak.py b/fawkes/eval_cloak.py new file mode 100644 index 0000000..0c3ca2e --- /dev/null +++ b/fawkes/eval_cloak.py @@ -0,0 +1,202 @@ +import sys + +sys.path.append("/home/shansixioing/tools/") +sys.path.append("/home/shansixioing/cloak/") + +import argparse +import gen_utils +from tensorflow import set_random_seed +from encode_utils import * +import random +import pickle +import re +import locale + +loc = locale.getlocale() 
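# Read the inherited locale here and re-apply it below, pinning it
# explicitly for any locale-sensitive formatting done during evaluation.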
+locale.setlocale(locale.LC_ALL, loc) + +SEEDS = [12345, 23451, 34512, 45123, 51234, 54321, 43215, 32154, 21543, 15432] +IMG_SHAPE = [224, 224, 3] + +MODEL = { + 'vggface1_inception': "0", + 'vggface1_dense': "1", + "vggface2_inception": "2", + "vggface2_dense": "3", + "webface_dense": "4", + "webface_inception": "5", +} + +RES_DIR = '/home/shansixioing/cloak/results/' + + +def select_samples(data_dir): + all_data_path = [] + for cls in os.listdir(data_dir): + cls_dir = os.path.join(data_dir, cls) + for data_path in os.listdir(cls_dir): + all_data_path.append(os.path.join(cls_dir, data_path)) + + return all_data_path + + +def generator_wrap(cloak_data, n_uncloaked, n_classes, test=False, validation_split=0.1): + if test: + # all_data_path = cloak_data.all_test_path + all_data_path = select_samples(cloak_data.test_data_dir) + else: + # all_data_path = cloak_data.all_training_path + all_data_path = select_samples(cloak_data.train_data_dir) + split = int(len(cloak_data.cloaked_protect_train_X) * (1 - validation_split)) + cloaked_train_X = cloak_data.cloaked_protect_train_X[:split] + if cloak_data.cloaked_sybil_train_X is not None: + cloaked_sybil_X = cloak_data.cloaked_sybil_train_X #[:args.number_sybil * 131] + # + # for _ in range(len(cloaked_sybil_X) - 131): + # all_data_path.append(cloak_data.sybil_class_path[0]) + + # random seed for selecting uncloaked pictures + np.random.seed(12345) + uncloaked_path = np.random.choice(cloak_data.protect_class_path, n_uncloaked).tolist() + + while True: + batch_X = [] + batch_Y = [] + cur_batch_path = np.random.choice(all_data_path, args.batch_size) + for p in cur_batch_path: + cur_y = cloak_data.path2idx[p] + # protect class and sybil class do not need to appear in test dataset + if test and (re.search(cloak_data.protect_class, p) or re.search(cloak_data.sybil_class, p)): + continue + # protect class images in train dataset + elif p in cloak_data.protect_class_path and p not in uncloaked_path: + cur_x = random.choice(cloaked_train_X) + # sybil class in train dataset + elif p in cloak_data.sybil_class_path and cloak_data.cloaked_sybil_train_X is not None: + cur_x = random.choice(cloaked_sybil_X) + else: + im = image.load_img(p, target_size=cloak_data.img_shape) + im = image.img_to_array(im) + cur_x = preprocess_input(im) + batch_X.append(cur_x) + batch_Y.append(cur_y) + + batch_X = np.array(batch_X) + batch_Y = to_categorical(np.array(batch_Y), num_classes=n_classes) + + yield batch_X, batch_Y + + +def eval_uncloaked_test_data(cloak_data, n_classes): + original_label = cloak_data.path2idx[list(cloak_data.protect_class_path)[0]] + protect_test_X = cloak_data.protect_test_X + original_Y = [original_label] * len(protect_test_X) + original_Y = to_categorical(original_Y, n_classes) + return protect_test_X, original_Y + + +def eval_cloaked_test_data(cloak_data, n_classes, validation_split=0.1): + split = int(len(cloak_data.cloaked_protect_train_X) * (1 - validation_split)) + cloaked_test_X = cloak_data.cloaked_protect_train_X[split:] + original_label = cloak_data.path2idx[list(cloak_data.protect_class_path)[0]] + original_Y = [original_label] * len(cloaked_test_X) + original_Y = to_categorical(original_Y, n_classes) + return cloaked_test_X, original_Y + + +def main(): + SEED = SEEDS[args.seed_idx] + random.seed(SEED) + set_random_seed(SEED) + gen_utils.init_gpu(args.gpu) + + if args.dataset == 'pubfig': + N_CLASSES = 65 + CLOAK_DIR = "{}_tm{}_tgt57_r1.0_th{}".format(args.dataset, args.model_idx, args.th) + elif args.dataset == 'scrub': + N_CLASSES = 530 + 
CLOAK_DIR = "{}_tm{}_tgtPatrick_Dempsey_r1.0_th{}_joint".format(args.dataset, args.model_idx, args.th) + elif args.dataset == 'webface': + N_CLASSES = 10575 + CLOAK_DIR = "{}_tm{}_tgt1640351_r1.0_th0.01/".format(args.dataset, args.model_idx) + else: + raise ValueError + print(CLOAK_DIR) + + CLOAK_DIR = os.path.join(RES_DIR, CLOAK_DIR) + RES = pickle.load(open(os.path.join(CLOAK_DIR, "cloak_data.p"), 'rb')) + + print("Build attacker's model") + cloak_data = RES['cloak_data'] + EVAL_RES = {} + train_generator = generator_wrap(cloak_data, n_uncloaked=args.n_uncloaked, n_classes=N_CLASSES, + validation_split=args.validation_split) + test_generator = generator_wrap(cloak_data, test=True, n_uncloaked=args.n_uncloaked, n_classes=N_CLASSES, + validation_split=args.validation_split) + EVAL_RES['transfer_model'] = args.transfer_model + if args.end2end: + model = load_end2end_model("dense", N_CLASSES) + else: + base_model = load_extractor(args.transfer_model) + model = load_victim_model(teacher_model=base_model, number_classes=N_CLASSES) + + original_X, original_Y = eval_uncloaked_test_data(cloak_data, N_CLASSES) + cloaked_test_X, cloaked_test_Y = eval_cloaked_test_data(cloak_data, N_CLASSES, + validation_split=args.validation_split) + + model.fit_generator(train_generator, steps_per_epoch=cloak_data.number_samples // 32, + validation_data=(original_X, original_Y), epochs=args.n_epochs, verbose=2, + use_multiprocessing=True, workers=3) + + _, acc_original = model.evaluate(original_X, original_Y, verbose=0) + print("Accuracy on uncloaked/original images TEST: {:.4f}".format(acc_original)) + EVAL_RES['acc_original'] = acc_original + + _, acc_cloaked = model.evaluate(cloaked_test_X, cloaked_test_Y, verbose=0) + print("Accuracy on cloaked images TEST: {:.4f}".format(acc_cloaked)) + EVAL_RES['acc_cloaked'] = acc_cloaked + + # pred = model.predict_generator(test_generator, verbose=0, steps=10) + # pred = np.argmax(pred, axis=1) + # print(pred) + _, other_acc = model.evaluate_generator(test_generator, verbose=0, steps=50) + print("Accuracy on other classes {:.4f}".format(other_acc)) + EVAL_RES['other_acc'] = other_acc + gen_utils.dump_dictionary_as_json(EVAL_RES, + os.path.join(CLOAK_DIR, "{}_eval_sybil_uncloaked{}_seed{}_th{}.json".format( + args.transfer_model, args.end2end, args.seed_idx, args.th))) + + +def parse_arguments(argv): + parser = argparse.ArgumentParser() + + parser.add_argument('--gpu', type=str, + help='GPU id', default='1') + parser.add_argument('--n_uncloaked', type=int, + help='number of uncloaked images', default=0) + parser.add_argument('--seed_idx', type=int, + help='random seed index', default=0) + parser.add_argument('--dataset', type=str, + help='name of dataset', default='pubfig') + parser.add_argument('--model_idx', type=str, + help='teacher model index', default="2") + parser.add_argument('--transfer_model', type=str, + help='student model', default='vggface2_inception') + parser.add_argument('--end2end', type=int, + help='whether use end2end', default=0) + parser.add_argument('--batch_size', type=int, default=32) + parser.add_argument('--validation_split', type=float, default=0.1) + parser.add_argument('--use_sybil', type=int, + help='whether use sybil class', default=0) + parser.add_argument('--number_sybil', type=int, + help='whether use sybil class', default=1) + parser.add_argument('--n_epochs', type=int, default=3) + parser.add_argument('--th', type=float, default=0.01) + parser.add_argument('--limit', type=int, default=0) + return parser.parse_args(argv) + + +if __name__ 
== '__main__': + args = parse_arguments(sys.argv[1:]) + main() +# python3 eval_cloak.py --gpu 2 --n_uncloaked 0 --dataset pubfig --model_idx 5 --transfer_model webface_inception \ No newline at end of file diff --git a/fawkes/protection.py b/fawkes/protection.py new file mode 100644 index 0000000..201cb17 --- /dev/null +++ b/fawkes/protection.py @@ -0,0 +1,138 @@ +import sys + +sys.path.append("/home/shansixioing/tools/") +sys.path.append("/home/shansixioing/cloak/") + +import argparse +from tensorflow import set_random_seed +from .differentiator import FawkesMaskGeneration +import os +import numpy as np +import random +import pickle +from .utils import load_extractor, CloakData, init_gpu + +random.seed(12243) +np.random.seed(122412) +set_random_seed(12242) + +SYBIL_ONLY = False + +NUM_IMG_PROTECTED = 20 # Number of images used to optimize the target class +BATCH_SIZE = 20 + +MODEL_IDX = { + 'vggface1_inception': "0", + 'vggface1_dense': "1", + "vggface2_inception": "2", + "vggface2_dense": "3", + "webface_dense": "4", + "webface_inception": "5", +} + +IDX2MODEL = {v: k for k, v in MODEL_IDX.items()} + +IMG_SHAPE = [224, 224, 3] + +GLOBAL_MASK = 0 + +MAXIMIZE = False +MAX_ITER = 500 +INITIAL_CONST = 1e6 +LR = 0.1 + + +def diff_protected_data(sess, feature_extractors_ls, image_X, number_protect, target_X=None, sybil=False, th=0.01): + image_X = image_X[:number_protect] + + differentiator = FawkesMaskGeneration(sess, feature_extractors_ls, + batch_size=BATCH_SIZE, + mimic_img=True, + intensity_range='imagenet', + initial_const=INITIAL_CONST, + learning_rate=LR, + max_iterations=MAX_ITER, + l_threshold=th, + verbose=1, maximize=False, keep_final=False, image_shape=image_X.shape[1:]) + + if len(target_X) < len(image_X): + target_X = np.concatenate([target_X, target_X, target_X, target_X, target_X]) + target_X = target_X[:len(image_X)] + cloaked_image_X = differentiator.attack(image_X, target_X) + return cloaked_image_X + + +def save_results(RES, path): + pickle.dump(RES, open(path, "wb")) + + +def perform_defense(): + RES = {} + sess = init_gpu(args.gpu) + DSSIM_THRESHOLD = args.th + + FEATURE_EXTRACTORS = [IDX2MODEL[args.model_idx]] + MODEL_HASH = "".join(MODEL_IDX[m] for m in FEATURE_EXTRACTORS) + + RES_DIR = '../results/' + RES['num_img_protected'] = NUM_IMG_PROTECTED + RES['extractors'] = FEATURE_EXTRACTORS + num_protect = NUM_IMG_PROTECTED + + print(FEATURE_EXTRACTORS) + feature_extractors_ls = [load_extractor(name) for name in FEATURE_EXTRACTORS] + + protect_class = args.protect_class + + cloak_data = CloakData(args.dataset, target_selection_tries=1, protect_class=protect_class) + print("Protect Class: ", cloak_data.protect_class) + + if "robust" in FEATURE_EXTRACTORS[0]: + non_robust = MODEL_IDX["_".join(FEATURE_EXTRACTORS[0].split("_")[:2])] + if args.dataset == 'pubfig': + CLOAK_DIR = 'pubfig_tm{}_tgt57_r1.0_th0.01'.format(non_robust) + CLOAK_DIR = os.path.join(RES_DIR, CLOAK_DIR) + RES = pickle.load(open(os.path.join(CLOAK_DIR, "cloak_data.p"), 'rb')) + cloak_data = RES['cloak_data'] + elif args.dataset == 'scrub': + CLOAK_DIR = 'scrub_tm{}_tgtPatrick_Dempsey_r1.0_th0.01'.format(non_robust) + CLOAK_DIR = os.path.join(RES_DIR, CLOAK_DIR) + RES = pickle.load(open(os.path.join(CLOAK_DIR, "cloak_data.p"), 'rb')) + cloak_data = RES['cloak_data'] + else: + cloak_data.target_path, cloak_data.target_data = cloak_data.select_target_label(feature_extractors_ls, + FEATURE_EXTRACTORS) + + RES_FILE_NAME = "{}_tm{}_tgt{}_r{}_th{}".format(args.dataset, MODEL_HASH, cloak_data.protect_class, RATIO, + 
DSSIM_THRESHOLD) + RES_FILE_NAME = os.path.join(RES_DIR, RES_FILE_NAME) + os.makedirs(RES_FILE_NAME, exist_ok=True) + + print("Protect Current Label Data...") + + cloak_image_X = diff_protected_data(sess, feature_extractors_ls, cloak_data.protect_train_X, + number_protect=num_protect, + target_X=cloak_data.target_data, sybil=False, th=DSSIM_THRESHOLD) + + cloak_data.cloaked_protect_train_X = cloak_image_X + RES['cloak_data'] = cloak_data + save_results(RES, os.path.join(RES_FILE_NAME, 'cloak_data.p')) + + +def parse_arguments(argv): + parser = argparse.ArgumentParser() + parser.add_argument('--gpu', type=str, + help='GPU id', default='0') + parser.add_argument('--dataset', type=str, + help='name of dataset', default='pubfig') + parser.add_argument('--model_idx', type=str, + help='teacher model index', default="3") + parser.add_argument('--th', type=float, default=0.01) + parser.add_argument('--protect_class', type=str, default=None) + + return parser.parse_args(argv) + + +if __name__ == '__main__': + args = parse_arguments(sys.argv[1:]) + perform_defense() diff --git a/fawkes/utils.py b/fawkes/utils.py new file mode 100644 index 0000000..c40deca --- /dev/null +++ b/fawkes/utils.py @@ -0,0 +1,410 @@ +import os +import pickle +import random + +import keras +import keras.backend as K +import numpy as np +import tensorflow as tf +from keras.applications.vgg16 import preprocess_input +from keras.preprocessing import image +from keras.utils import to_categorical +from sklearn.metrics import pairwise_distances + + +def fix_gpu_memory(mem_fraction=1): + os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' + gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=mem_fraction) + tf_config = tf.ConfigProto(gpu_options=gpu_options) + tf_config.gpu_options.allow_growth = True + tf_config.log_device_placement = False + init_op = tf.global_variables_initializer() + sess = tf.Session(config=tf_config) + sess.run(init_op) + K.set_session(sess) + return sess + + +def init_gpu(gpu_index, force=False): + if isinstance(gpu_index, list): + gpu_num = ','.join([str(i) for i in gpu_index]) + else: + gpu_num = str(gpu_index) + if "CUDA_VISIBLE_DEVICES" in os.environ and os.environ["CUDA_VISIBLE_DEVICES"] and not force: + print('GPU already initiated') + return + os.environ["CUDA_VISIBLE_DEVICES"] = gpu_num + sess = fix_gpu_memory() + return sess + + +def preprocess(X, method): + # assume color last + assert method in {'raw', 'imagenet', 'inception', 'mnist'} + + if method is 'raw': + pass + elif method is 'imagenet': + X = imagenet_preprocessing(X) + else: + raise Exception('unknown method %s' % method) + + return X + + +def reverse_preprocess(X, method): + # assume color last + assert method in {'raw', 'imagenet', 'inception', 'mnist'} + + if method is 'raw': + pass + elif method is 'imagenet': + X = imagenet_reverse_preprocessing(X) + else: + raise Exception('unknown method %s' % method) + + return X + + +def imagenet_preprocessing(x, data_format=None): + if data_format is None: + data_format = K.image_data_format() + assert data_format in ('channels_last', 'channels_first') + + x = np.array(x) + if data_format == 'channels_first': + # 'RGB'->'BGR' + if x.ndim == 3: + x = x[::-1, ...] + else: + x = x[:, ::-1, ...] 
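            # (a 3D array is a single CHW image, a 4D array is an NCHW
            # batch, hence the different axis flipped above)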
+ else: + # 'RGB'->'BGR' + x = x[..., ::-1] + + mean = [103.939, 116.779, 123.68] + std = None + + # Zero-center by mean pixel + if data_format == 'channels_first': + if x.ndim == 3: + x[0, :, :] -= mean[0] + x[1, :, :] -= mean[1] + x[2, :, :] -= mean[2] + if std is not None: + x[0, :, :] /= std[0] + x[1, :, :] /= std[1] + x[2, :, :] /= std[2] + else: + x[:, 0, :, :] -= mean[0] + x[:, 1, :, :] -= mean[1] + x[:, 2, :, :] -= mean[2] + if std is not None: + x[:, 0, :, :] /= std[0] + x[:, 1, :, :] /= std[1] + x[:, 2, :, :] /= std[2] + else: + x[..., 0] -= mean[0] + x[..., 1] -= mean[1] + x[..., 2] -= mean[2] + if std is not None: + x[..., 0] /= std[0] + x[..., 1] /= std[1] + x[..., 2] /= std[2] + + return x + + +def imagenet_reverse_preprocessing(x, data_format=None): + import keras.backend as K + """ Reverse preprocesses a tensor encoding a batch of images. + # Arguments + x: input Numpy tensor, 4D. + data_format: data format of the image tensor. + # Returns + Preprocessed tensor. + """ + x = np.array(x) + if data_format is None: + data_format = K.image_data_format() + assert data_format in ('channels_last', 'channels_first') + + if data_format == 'channels_first': + if x.ndim == 3: + # Zero-center by mean pixel + x[0, :, :] += 103.939 + x[1, :, :] += 116.779 + x[2, :, :] += 123.68 + # 'BGR'->'RGB' + x = x[::-1, :, :] + else: + x[:, 0, :, :] += 103.939 + x[:, 1, :, :] += 116.779 + x[:, 2, :, :] += 123.68 + x = x[:, ::-1, :, :] + else: + # Zero-center by mean pixel + x[..., 0] += 103.939 + x[..., 1] += 116.779 + x[..., 2] += 123.68 + # 'BGR'->'RGB' + x = x[..., ::-1] + return x + + +def imagenet_reverse_preprocessing_cntk(x, data_format=None): + import keras.backend as K + """ Reverse preprocesses a tensor encoding a batch of images. + # Arguments + x: input Numpy tensor, 4D. + data_format: data format of the image tensor. + # Returns + Preprocessed tensor. 
+ """ + x = np.array(x) + if data_format is None: + data_format = K.image_data_format() + assert data_format in ('channels_last', 'channels_first') + + if data_format == 'channels_first': + # Zero-center by mean pixel + x[:, 0, :, :] += 114.0 + x[:, 1, :, :] += 114.0 + x[:, 2, :, :] += 114.0 + # 'BGR'->'RGB' + x = x[:, ::-1, :, :] + else: + # Zero-center by mean pixel + x[:, :, :, 0] += 114.0 + x[:, :, :, 1] += 114.0 + x[:, :, :, 2] += 114.0 + # 'BGR'->'RGB' + x = x[:, :, :, ::-1] + return x + + +def load_extractor(name): + model = keras.models.load_model("/home/shansixioing/cloak/models/extractors/{}_extract.h5".format(name)) + return model + + +def get_dataset_path(dataset): + if dataset == "webface": + train_data_dir = '/mnt/data/sixiongshan/data/webface/train' + test_data_dir = '/mnt/data/sixiongshan/data/webface/test' + number_classes = 10575 + number_samples = 475137 + + elif dataset == "vggface1": + train_data_dir = '/mnt/data/sixiongshan/data/vggface/train' + test_data_dir = '/mnt/data/sixiongshan/data/vggface/test' + number_classes = 2622 + number_samples = 1716436 // 3 + + elif dataset == "vggface2": + train_data_dir = '/mnt/data/sixiongshan/data/vggface2/train' + test_data_dir = '/mnt/data/sixiongshan/data/vggface2/test' + number_classes = 8631 + number_samples = 3141890 // 3 + + elif dataset == "scrub": + train_data_dir = '/mnt/data/sixiongshan/data/facescrub/keras_flow_dir/train' + test_data_dir = '/mnt/data/sixiongshan/data/facescrub/keras_flow_dir/test' + number_classes = 530 + number_samples = 57838 + + elif dataset == "youtubeface": + train_data_dir = '/mnt/data/sixiongshan/data/youtubeface/keras_flow_data/train_mtcnnpy_224' + test_data_dir = '/mnt/data/sixiongshan/data/youtubeface/keras_flow_data/test_mtcnnpy_224' + number_classes = 1283 + number_samples = 587137 // 5 + + elif dataset == "emily": + train_data_dir = '/mnt/data/sixiongshan/data/emface/train' + test_data_dir = '/mnt/data/sixiongshan/data/emface/test' + number_classes = 66 + number_samples = 6070 + + elif dataset == "pubfig": + train_data_dir = '/mnt/data/sixiongshan/data/pubfig/train' + test_data_dir = '/mnt/data/sixiongshan/data/pubfig/test' + number_classes = 65 + number_samples = 5979 + + elif dataset == "iris": + train_data_dir = '/mnt/data/sixiongshan/data/iris/train' + test_data_dir = '/mnt/data/sixiongshan/data/iris/test' + number_classes = 1000 + number_samples = 14000 + else: + print("Dataset {} does not exist... 
Abort".format(dataset)) + exit(1) + + return train_data_dir, test_data_dir, number_classes, number_samples + + +def normalize(x): + return x / np.linalg.norm(x, axis=1, keepdims=True) + + +class CloakData(object): + def __init__(self, dataset, img_shape=(224, 224), target_selection_tries=30, protect_class=None): + self.dataset = dataset + self.img_shape = img_shape + self.target_selection_tries = target_selection_tries + + self.train_data_dir, self.test_data_dir, self.number_classes, self.number_samples = get_dataset_path(dataset) + self.all_labels = sorted(list(os.listdir(self.train_data_dir))) + if protect_class: + self.protect_class = protect_class + else: + self.protect_class = random.choice(self.all_labels) + + self.sybil_class = random.choice([l for l in self.all_labels if l != self.protect_class]) + print("Protect label: {} | Sybil label: {}".format(self.protect_class, self.sybil_class)) + self.protect_train_X, self.protect_test_X = self.load_label_data(self.protect_class) + self.sybil_train_X, self.sybil_test_X = self.load_label_data(self.sybil_class) + + self.cloaked_protect_train_X = None + self.cloaked_sybil_train_X = None + + self.label2path_train, self.label2path_test, self.path2idx = self.build_data_mapping() + self.all_training_path = self.get_all_data_path(self.label2path_train) + self.all_test_path = self.get_all_data_path(self.label2path_test) + self.protect_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.protect_class)) + self.sybil_class_path = self.get_class_image_files(os.path.join(self.train_data_dir, self.sybil_class)) + + print("Find {} protect images".format(len(self.protect_class_path))) + + def get_class_image_files(self, path): + return [os.path.join(path, f) for f in os.listdir(path)] + + def extractor_ls_predict(self, feature_extractors_ls, X): + feature_ls = [] + for extractor in feature_extractors_ls: + cur_features = extractor.predict(X) + feature_ls.append(cur_features) + concated_feature_ls = np.concatenate(feature_ls, axis=1) + concated_feature_ls = normalize(concated_feature_ls) + return concated_feature_ls + + def load_embeddings(self, feature_extractors_names): + dictionaries = [] + + for extractor_name in feature_extractors_names: + path2emb = pickle.load(open("/home/shansixioing/cloak/embs/{}_emb_norm.p".format(extractor_name), "rb")) + # path2emb = pickle.load(open("/home/shansixioing/cloak/embs/vggface2_inception_emb.p".format(extractor_name), "rb")) + dictionaries.append(path2emb) + merge_dict = {} + for k in dictionaries[0].keys(): + cur_emb = [dic[k] for dic in dictionaries] + merge_dict[k] = np.concatenate(cur_emb) + return merge_dict + + def select_target_label(self, feature_extractors_ls, feature_extractors_names, metric='l2'): + original_feature_x = self.extractor_ls_predict(feature_extractors_ls, self.protect_train_X) + + path2emb = self.load_embeddings(feature_extractors_names) + items = list(path2emb.items()) + paths = [p[0] for p in items] + embs = [p[1] for p in items] + embs = np.array(embs) + + pair_dist = pairwise_distances(original_feature_x, embs, 'l2') + max_sum = np.min(pair_dist, axis=0) + sorted_idx = np.argsort(max_sum)[::-1] + + highest_num = 0 + paired_target_X = None + final_target_class_path = None + for idx in sorted_idx[:2]: + target_class_path = paths[idx] + cur_target_X = self.load_dir(target_class_path) + cur_target_X = np.concatenate([cur_target_X, cur_target_X, cur_target_X]) + cur_tot_sum, cur_paired_target_X = self.calculate_dist_score(self.protect_train_X, cur_target_X, + 
feature_extractors_ls, + metric=metric) + if cur_tot_sum > highest_num: + highest_num = cur_tot_sum + paired_target_X = cur_paired_target_X + final_target_class_path = target_class_path + + np.random.shuffle(paired_target_X) + return final_target_class_path, paired_target_X + + def calculate_dist_score(self, a, b, feature_extractors_ls, metric='l2'): + features1 = self.extractor_ls_predict(feature_extractors_ls, a) + features2 = self.extractor_ls_predict(feature_extractors_ls, b) + + pair_cos = pairwise_distances(features1, features2, metric) + max_sum = np.min(pair_cos, axis=0) + max_sum_arg = np.argsort(max_sum)[::-1] + max_sum_arg = max_sum_arg[:len(a)] + max_sum = [max_sum[i] for i in max_sum_arg] + paired_target_X = [b[j] for j in max_sum_arg] + paired_target_X = np.array(paired_target_X) + return np.min(max_sum), paired_target_X + + def get_all_data_path(self, label2path): + all_paths = [] + for k, v in label2path.items(): + cur_all_paths = [os.path.join(k, cur_p) for cur_p in v] + all_paths.extend(cur_all_paths) + return all_paths + + def load_label_data(self, label): + train_label_path = os.path.join(self.train_data_dir, label) + test_label_path = os.path.join(self.test_data_dir, label) + train_X = self.load_dir(train_label_path) + test_X = self.load_dir(test_label_path) + return train_X, test_X + + def load_dir(self, path): + assert os.path.exists(path) + x_ls = [] + for file in os.listdir(path): + cur_path = os.path.join(path, file) + im = image.load_img(cur_path, target_size=self.img_shape) + im = image.img_to_array(im) + x_ls.append(im) + raw_x = np.array(x_ls) + return preprocess_input(raw_x) + + def build_data_mapping(self): + label2path_train = {} + label2path_test = {} + idx = 0 + path2idx = {} + for label_name in self.all_labels: + full_path_train = os.path.join(self.train_data_dir, label_name) + full_path_test = os.path.join(self.test_data_dir, label_name) + label2path_train[full_path_train] = list(os.listdir(full_path_train)) + label2path_test[full_path_test] = list(os.listdir(full_path_test)) + for img_file in os.listdir(full_path_train): + path2idx[os.path.join(full_path_train, img_file)] = idx + for img_file in os.listdir(full_path_test): + path2idx[os.path.join(full_path_test, img_file)] = idx + idx += 1 + return label2path_train, label2path_test, path2idx + + def generate_data_post_cloak(self, sybil=False): + assert self.cloaked_protect_train_X is not None + while True: + batch_X = [] + batch_Y = [] + cur_batch_path = random.sample(self.all_training_path, 32) + for p in cur_batch_path: + cur_y = self.path2idx[p] + if p in self.protect_class_path: + cur_x = random.choice(self.cloaked_protect_train_X) + elif sybil and (p in self.sybil_class): + cur_x = random.choice(self.cloaked_sybil_train_X) + else: + im = image.load_img(p, target_size=self.img_shape) + im = image.img_to_array(im) + cur_x = preprocess_input(im) + batch_X.append(cur_x) + batch_Y.append(cur_y) + batch_X = np.array(batch_X) + batch_Y = to_categorical(np.array(batch_Y), num_classes=self.number_classes) + yield batch_X, batch_Y
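# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the library): a minimal
# cloaking flow mirroring protection.py, assuming the extractor and dataset
# paths hard-coded above exist on disk.
#
#   from fawkes.differentiator import FawkesMaskGeneration
#
#   sess = init_gpu("0")
#   extractor_names = ["vggface2_dense"]
#   extractors = [load_extractor(n) for n in extractor_names]
#   cloak = CloakData("pubfig")
#   target_path, target_X = cloak.select_target_label(extractors,
#                                                     extractor_names)
#   gen = FawkesMaskGeneration(sess, extractors, batch_size=2, mimic_img=True,
#                              intensity_range='imagenet', l_threshold=0.01,
#                              max_iterations=500, verbose=1,
#                              image_shape=cloak.protect_train_X.shape[1:])
#   cloaked = gen.attack(cloak.protect_train_X[:2], target_X[:2])
# ---------------------------------------------------------------------------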