Categorising the dataset into four main categories
Posted Dec 20, 2018 |
Section 3.7
#!/usr/local/bin/python3
import json
import hashlib
import psycopg2
import pprint
import times
from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
import numpy
import collections
def similar(a, b):
    """Return the difflib similarity ratio of *a* and *b*.

    The value is a float between 0.0 (nothing in common) and 1.0
    (identical sequences), as computed by ``SequenceMatcher.ratio()``
    with no junk filter.
    """
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()
# Keywords that characterise each of the four top-level categories.
# The insertion order matters: when several categories tie on score, the one
# listed last among the tied categories is reported as the winner.
_CATEGORY_KEYWORDS = {
    'rat': [
        'backdoor',
        'hack',
        'shell',
        'dropper',
    ],
    'ransomware': [
        'ransom',
        'crypt',
    ],
    'miner': [
        'miner',
        'coin',
    ],
    'adware': [
        'pup',
        'pua',
        'adware',
        'cleaner',
        'keeper',
        'opinion',
    ],
}


def identify_family(family):
    """Score a sample's family names against each category's keywords.

    For every keyword of a category, the fuzzy ``token_set_ratio`` between
    that keyword and each name in *family* is averaged over all names; the
    category's score is the highest such average among its keywords (never
    below 0).

    Args:
        family: iterable of family-name strings for one sample.

    Returns:
        A dict ``{'winner': <category>, 'results': {<category>: <score>}}``.
        The winner is the category with the highest score; on a tie the
        category listed last among the tied ones wins, so an input that
        matches nothing (including an empty *family*) reports ``'adware'``
        with all scores 0.
    """
    # Materialise once so a one-shot iterable can be scanned per keyword.
    names = list(family)
    test_results = {}
    for testname, keywords in _CATEGORY_KEYWORDS.items():
        if names:
            keyword_scores = [
                numpy.mean(
                    [fuzz.token_set_ratio(name, keyword) for name in names])
                for keyword in keywords
            ]
        else:
            # numpy.mean([]) would warn and yield nan; nothing to score.
            keyword_scores = []
        # Leading 0 is the floor; max() keeps the first maximal element, so
        # a score only replaces the floor when it is strictly greater.
        test_results[testname] = max([0, *keyword_scores])
    # Stable ascending sort: the last entry is the highest-scoring category.
    ranked = sorted(test_results.items(), key=lambda item: item[1])
    winner = ranked[-1][0]
    return {'winner': winner, 'results': test_results}
# Classify every processed sample that has no category yet but carries at
# least one family label, storing the winning category and the raw scores.
conn = psycopg2.connect("dbname=elk_development user=postgres")
cur = conn.cursor()
# collect all unclassified samples that have a family label to work with
cur.execute("""select sha, vendor_family, other_family
from
malware_metadata
where
classified_as is null
and processing_result = 'PROCESSED'
and (
vendor_family <> '' OR other_family <> ''
)
""")
rows = cur.fetchall()
for row in rows:
    file_sha = row[0]
    # A NULL column arrives as None. The query still returns rows where only
    # one of the two labels is NULL, so treat None like an empty label
    # instead of crashing on string concatenation.
    vendor_family = row[1] or ''
    other_family = row[2] or ''
    if not vendor_family and not other_family:
        # nothing to classify for this sample; carry on with the next one
        continue
    if not vendor_family:
        composite_family = other_family.strip()
    elif not other_family:
        composite_family = vendor_family
    else:
        composite_family = vendor_family + ";" + other_family
    composite_family = composite_family.lower().split(';')
    # match every family name against each category's terms and keep the
    # highest-rated category
    result = identify_family(composite_family)
    # the context manager closes the cursor even if the update fails
    with conn.cursor() as cursor:
        cursor.execute(
            "update malware_metadata set classified_as = %s, classification_scores = %s where sha = %s",
            (result["winner"], json.dumps(result["results"]), file_sha))
    # commit per row so already-classified samples survive a later failure
    conn.commit()
    print("***** Combined Families: ", result)