Antimalweb / URLNet

Code for the paper URLNet - Learning a URL Representation with Deep Learning for Malicious URL Detection
Apache License 2.0

Prediction function #9

Open mdavis332 opened 5 years ago

mdavis332 commented 5 years ago

I was unable to find an exportable function that could be called from another Python script to provide a prediction or score for a single URL or a list of URLs.

i.e., once the model is trained, how does one use it to make predictions on a bare URL with no label?

Is that something I missed within this code, or is the code solely meant as an academic proof of concept for model development?

eladwass commented 5 years ago

Indeed, I couldn't find one either. However, you can use test.py for that by adding a dummy label and then using the prediction/score column from its output...
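For example, a rough sketch (assuming the <label><tab><URL> line format that train.py and test.py read, where the dummy label's value does not affect the score):

+1	http://example.com/some/path?q=1

Then run test.py on that file with the dictionaries and checkpoint directory from your training run, and read the malicious-class score column from its output file.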

mdavis332 commented 5 years ago

I wound up using/rewriting most of test.py into an init.py with an importable predict(url) function just to take a single unlabeled URL and provide a score. I totally violated the DRY principle since there's so much shared code, but it was a quick hack.

eladwass commented 5 years ago

Sounds great! Can you share it with us?

n-skriabin commented 3 years ago

@mdavis332 I am trying to do the same for my scientific work. Can you share your solution with us?

n-skriabin commented 3 years ago

@eladwass In general, I had to redo everything myself. I combined the test.py and utils.py files so that using the neural network comes down to calling a single method, 'main_logic'.

I apologize in advance for the terrible style of the code; it took many attempts before I got something workable, and I have only been learning Python for a couple of weeks.

import pickle
import time
from bisect import bisect_left

import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn
from tqdm import tqdm
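# NOTE: this assumes TensorFlow 1.x (tensorflow.contrib was removed in TF 2.x)
# and a model already trained with train.py, with its dictionaries and
# checkpoints saved under runs/10000/ (adjust the paths below to your own run).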

def main_logic(input_url):
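    """Score a single URL with a trained URLNet model.

    Returns (is_malicious, score): score is the softmax probability of the
    malicious class, and is_malicious is True when score >= 0.75.
    """
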
    def get_word_vocab(urls, max_length_words, min_word_freq=0):
        vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_words, min_frequency=min_word_freq) 
        start = time.time()
        x = np.array(list(vocab_processor.fit_transform(urls)))
        vocab_dict = vocab_processor.vocabulary_._mapping
        reverse_dict = dict(zip(vocab_dict.values(), vocab_dict.keys()))
        return x, reverse_dict 

    def get_words(x, reverse_dict, delimit_mode, urls=None): 
        processed_x = []
        if delimit_mode == 0: 
            for url in x: 
                words = []
                for word_id in url: 
                    if word_id != 0: 
                        words.append(reverse_dict[word_id])
                    else: 
                        break
                processed_x.append(words) 
        elif delimit_mode == 1:
            for i in range(x.shape[0]):
                word_url = x[i]
                raw_url = urls[i]
                words = []
                for w in range(len(word_url)): 
                    word_id = word_url[w]
                    if word_id == 0: 
                        words.extend(list(raw_url))
                        break
                    else: 
                        word = reverse_dict[word_id]
                        idx = raw_url.index(word) 
                        special_chars = list(raw_url[0:idx])
                        words.extend(special_chars) 
                        words.append(word) 
                        raw_url = raw_url[idx+len(word):]
                        if w == len(word_url) - 1: 
                            words.extend(list(raw_url))
                processed_x.append(words)
        return processed_x 

    def get_char_ngrams(ngram_len, word): 
        word = "<" + word + ">" 
        chars = list(word) 
        begin_idx = 0
        ngrams = []
        while (begin_idx + ngram_len) <= len(chars): 
            end_idx = begin_idx + ngram_len 
            ngrams.append("".join(chars[begin_idx:end_idx])) 
            begin_idx += 1 
        return ngrams 

    def char_id_x(urls, char_dict, max_len_chars): 
        chared_id_x = []
        for url in urls: 
            url = list(url) 
            url_in_char_id = []
            l = min(len(url), max_len_chars)
            for i in range(l): 
                c = url[i] 
                try:
                    c_id = char_dict[c] 
                except KeyError:
                    c_id = 0
                url_in_char_id.append(c_id) 
            chared_id_x.append(url_in_char_id) 
        return chared_id_x 

    def ngram_id_x(word_x, max_len_subwords, high_freq_words=None):   
        char_ngram_len = 1
        all_ngrams = set() 
        ngramed_x = []
        all_words = set() 
        worded_x = []
        for url in word_x: 
            url_in_ngrams = []
            url_in_words = []
            words = url
            for word in words:
                ngrams = get_char_ngrams(char_ngram_len, word) 
                if (len(ngrams) > max_len_subwords) or \
                    (high_freq_words is not None and len(word)>1 and not is_in(high_freq_words, word)):  
                    all_ngrams.update(ngrams[:max_len_subwords])
                    url_in_ngrams.append(ngrams[:max_len_subwords]) 
                    all_words.add("<UNKNOWN>")
                    url_in_words.append("<UNKNOWN>")
                else:     
                    all_ngrams.update(ngrams)
                    url_in_ngrams.append(ngrams) 
                    all_words.add(word) 
                    url_in_words.append(word) 
            ngramed_x.append(url_in_ngrams)
            worded_x.append(url_in_words) 

        all_ngrams = list(all_ngrams) 
        ngrams_dict = dict()
        for i in range(len(all_ngrams)):  
            ngrams_dict[all_ngrams[i]] = i+1
        all_words = list(all_words) 
        words_dict = dict() 
        for i in range(len(all_words)): 
            words_dict[all_words[i]] = i+1   
        ngramed_id_x = []
        for ngramed_url in ngramed_x: 
            url_in_ngrams = []
            for ngramed_word in ngramed_url: 
                ngram_ids = [ngrams_dict[x] for x in ngramed_word] 
                url_in_ngrams.append(ngram_ids) 
            ngramed_id_x.append(url_in_ngrams)  
        worded_id_x = []
        for worded_url in worded_x: 
            word_ids = [words_dict[x] for x in worded_url]
            worded_id_x.append(word_ids) 

        return ngramed_id_x, ngrams_dict, worded_id_x, words_dict 

    def ngram_id_x_from_dict(word_x, max_len_subwords, ngram_dict, word_dict = None): 
        char_ngram_len = 1
        ngramed_id_x = [] 
        worded_id_x = []
        if word_dict:
            word_vocab = sorted(list(word_dict.keys()))
        for url in word_x: 
            url_in_ngrams = [] 
            url_in_words = [] 
            words = url
            for word in words:
                ngrams = get_char_ngrams(char_ngram_len, word) 
                if len(ngrams) > max_len_subwords:
                    word = "<UNKNOWN>"  
                ngrams_id = [] 
                for ngram in ngrams: 
                    if ngram in ngram_dict: 
                        ngrams_id.append(ngram_dict[ngram]) 
                    else: 
                        ngrams_id.append(0) 
                url_in_ngrams.append(ngrams_id)
                if is_in(word_vocab, word): 
                    word_id = word_dict[word]
                else: 
                    word_id = word_dict["<UNKNOWN>"] 
                url_in_words.append(word_id)
            ngramed_id_x.append(url_in_ngrams)
            worded_id_x.append(url_in_words)

        return ngramed_id_x, worded_id_x 

    def is_in(a,x): 
        i = bisect_left(a,x)
        if i != len(a) and a[i] == x: 
            return True 
        else:
            return False 

    def pad_seq(urls, max_d1=0, max_d2=0, embedding_size=128): 
        if max_d1 == 0 and max_d2 == 0: 
            for url in urls: 
                if len(url) > max_d1: 
                    max_d1 = len(url) 
                for word in url: 
                    if len(word) > max_d2: 
                        max_d2 = len(word) 
        pad_idx = np.zeros((len(urls), max_d1, max_d2, embedding_size))
        pad_urls = np.zeros((len(urls), max_d1, max_d2))
        pad_vec = [1 for i in range(embedding_size)]
        for d0 in range(len(urls)): 
            url = urls[d0]
            for d1 in range(len(url)): 
                if d1 < max_d1: 
                    word = url[d1]
                    for d2 in range(len(word)): 
                        if d2 < max_d2: 
                            pad_urls[d0,d1,d2] = word[d2]
                            pad_idx[d0,d1,d2] = pad_vec
        return pad_urls, pad_idx

    def pad_seq_in_word(urls, max_d1=0, embedding_size=128):
        if max_d1 == 0: 
            url_lens = [len(url) for url in urls]
            max_d1 = max(url_lens)
        pad_urls = np.zeros((len(urls), max_d1))
        for d0 in range(len(urls)): 
            url = urls[d0]
            for d1 in range(len(url)): 
                if d1 < max_d1: 
                    pad_urls[d0,d1] = url[d1]
        return pad_urls 

    def softmax(x): 
        e_x = np.exp(x - np.max(x))
        return e_x / e_x.sum() 

    def batch_iter(data, batch_size, num_epochs, shuffle=True): 
        data = np.array(data) 
        data_size = len(data) 
        num_batches_per_epoch = int((len(data)-1)/batch_size) + 1 
        for epoch in range(num_epochs): 
            if shuffle: 
                shuffle_indices = np.random.permutation(np.arange(data_size)) 
                shuffled_data = data[shuffle_indices]
            else: 
                shuffled_data = data 
            for batch_num in range(num_batches_per_epoch): 
                start_idx = batch_num * batch_size 
                end_idx = min((batch_num+1) * batch_size, data_size)
                yield shuffled_data[start_idx:end_idx]

    def save_test_result(labels, all_predictions, all_scores): 
        output_labels = []
        for i in labels: 
            if i == 1: 
                output_labels.append(i) 
            else: 
                output_labels.append(-1) 
        output_preds = [] 
        for i in all_predictions: 
            if i == 1: 
                output_preds.append(i) 
            else: 
                output_preds.append(-1) 
        softmax_scores = [softmax(i) for i in all_scores]
        # Only one URL is scored here, so return on the first (and only) entry:
        # flag the URL as malicious when its positive-class softmax score is
        # at least 0.75, and return that score alongside the flag.
        for i in range(len(output_labels)):
            if softmax_scores[i][1] >= 0.75:
                return True, softmax_scores[i][1]
            else:
                return False, softmax_scores[i][1]

    def test_step(x, emb_mode):
        p = 1.0
        if emb_mode == 1: 
            feed_dict = {
                input_x_char_seq: x[0],
                dropout_keep_prob: p}  
        elif emb_mode == 2: 
            feed_dict = {
                input_x_word: x[0],
                dropout_keep_prob: p}
        elif emb_mode == 3: 
            feed_dict = {
                input_x_char_seq: x[0],
                input_x_word: x[1],
                dropout_keep_prob: p}
        elif emb_mode == 4: 
            feed_dict = {
                input_x_word: x[0],
                input_x_char: x[1],
                input_x_char_pad_idx: x[2],
                dropout_keep_prob: p}
        elif emb_mode == 5:  
            feed_dict = {
                input_x_char_seq: x[0],
                input_x_word: x[1],
                input_x_char: x[2],
                input_x_char_pad_idx: x[3],
                dropout_keep_prob: p}
        preds, s = sess.run([predictions, scores], feed_dict)
        return preds, s

    # Default hyperparameters; these must match the settings the model was
    # trained with (emb_mode 5 is the full character + word URLNet model).
    default_max_len_words = 100
    default_max_len_chars = 100
    default_max_len_subwords = 20
    default_delimit_mode = 1
    default_emb_dim = 32
    default_emb_mode = 5
    default_batch_size = 128
    # A single input URL with a dummy label: the downstream code expects one
    # label per URL, but only the predicted score is actually used.
    urls = [input_url]
    labels = [1]

    # Preprocess the URL the same way test.py does: word tokens, character-level
    # representations of each word, and raw character ids, looked up in the
    # dictionaries saved during training.
    x, word_reverse_dict = get_word_vocab(urls, default_max_len_words)
    word_x = get_words(x, word_reverse_dict, default_delimit_mode, urls) 
    ngram_dict = pickle.load(open("runs/10000/subwords_dict.p", "rb"))
    word_dict = pickle.load(open("runs/10000/words_dict.p", "rb"))
    ngramed_id_x, worded_id_x = ngram_id_x_from_dict(word_x, default_max_len_subwords, ngram_dict, word_dict) 
    chars_dict = pickle.load(open("runs/10000/chars_dict.p", "rb"))          
    chared_id_x = char_id_x(urls, chars_dict, default_max_len_chars)

    # Restore the trained graph from the latest checkpoint and run a single
    # forward pass to obtain predictions and class scores.
    checkpoint_file = tf.train.latest_checkpoint("runs/10000/checkpoints/")
    graph = tf.Graph() 
    with graph.as_default(): 
        session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
        session_conf.gpu_options.allow_growth=True 
        sess = tf.Session(config=session_conf)
        with sess.as_default(): 
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file) 

            if default_emb_mode in [1, 3, 5]: 
                input_x_char_seq = graph.get_operation_by_name("input_x_char_seq").outputs[0]
            if default_emb_mode in [2, 3, 4, 5]:
                input_x_word = graph.get_operation_by_name("input_x_word").outputs[0]
            if default_emb_mode in [4, 5]:
                input_x_char = graph.get_operation_by_name("input_x_char").outputs[0]
                input_x_char_pad_idx = graph.get_operation_by_name("input_x_char_pad_idx").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0] 

            predictions = graph.get_operation_by_name("output/predictions").outputs[0]
            scores = graph.get_operation_by_name("output/scores").outputs[0]

            # Build the input batches for the selected embedding mode (only one
            # batch here, since a single URL is being scored).
            if default_emb_mode == 1:
                batches = batch_iter(list(chared_id_x), default_batch_size, 1, shuffle=False) 
            elif default_emb_mode == 2: 
                batches = batch_iter(list(worded_id_x), default_batch_size, 1, shuffle=False) 
            elif default_emb_mode == 3: 
                batches = batch_iter(list(zip(chared_id_x, worded_id_x)), default_batch_size, 1, shuffle=False)
            elif default_emb_mode == 4: 
                batches = batch_iter(list(zip(ngramed_id_x, worded_id_x)), default_batch_size, 1, shuffle=False)
            elif default_emb_mode == 5: 
                batches = batch_iter(list(zip(ngramed_id_x, worded_id_x, chared_id_x)), default_batch_size, 1, shuffle=False)    
            all_predictions = []
            all_scores = []

            nb_batches = int(len(labels) / default_batch_size)
            if len(labels) % default_batch_size != 0: 
              nb_batches += 1 
            it = tqdm(range(nb_batches), desc="emb_mode {} delimit_mode {} test_size {}".format(default_emb_mode, default_delimit_mode, len(labels)), ncols=0)
            for idx in it:
                batch = next(batches)

                if default_emb_mode == 1: 
                    x_char_seq = batch 
                elif default_emb_mode == 2: 
                    x_word = batch 
                elif default_emb_mode == 3: 
                    x_char_seq, x_word = zip(*batch) 
                elif default_emb_mode == 4: 
                    x_char, x_word = zip(*batch)
                elif default_emb_mode == 5: 
                    x_char, x_word, x_char_seq = zip(*batch)        

                x_batch = []    
                if default_emb_mode in [1, 3, 5]:
                    x_char_seq = pad_seq_in_word(x_char_seq, default_max_len_chars) 
                    x_batch.append(x_char_seq)
                if default_emb_mode in [2, 3, 4, 5]:
                    x_word = pad_seq_in_word(x_word, default_max_len_words) 
                    x_batch.append(x_word)
                if default_emb_mode in [4, 5]:
                    x_char, x_char_pad_idx = pad_seq(x_char, default_max_len_words, default_max_len_subwords, default_emb_dim)
                    x_batch.extend([x_char, x_char_pad_idx])

                batch_predictions, batch_scores = test_step(x_batch, default_emb_mode)            
                all_predictions = np.concatenate([all_predictions, batch_predictions]) 
                all_scores.extend(batch_scores) 

                it.set_postfix()

    res = save_test_result(labels, all_predictions, all_scores)
    return res
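
With this in place, scoring a URL from another script is one call (a sketch; 'urlnet_score' is just whatever filename the code above is saved under):

from urlnet_score import main_logic

is_malicious, score = main_logic("http://example.com/login.php?user=admin")
# is_malicious is True when the malicious-class softmax score is >= 0.75;
# score is that softmax probability itself.
print(is_malicious, score)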