thesis.yaxs.net

Categorising the dataset into four main categories

Posted Dec 20, 2018 | Section 3.7

#!/usr/local/bin/python3
import json
import hashlib
import psycopg2
import pprint
import times
from fuzzywuzzy import fuzz
from difflib import SequenceMatcher
import numpy
import collections

def similar(a, b):
  """Return the difflib similarity ratio between the sequences a and b.

  The value is whatever SequenceMatcher.ratio() reports for the pair, with
  no junk filter applied: 1.0 for identical sequences, 0.0 when nothing
  matches.
  """
  matcher = SequenceMatcher(isjunk=None, a=a, b=b)
  return matcher.ratio()

def identify_family(family):
  """Pick the malware category whose keywords best match the given labels.

  family is an iterable of family-label strings. For each category, every
  keyword is compared against every label with fuzz.token_set_ratio and the
  keyword's score is the mean over all labels. A category's score is the
  best of its keyword scores, with a floor of 0.

  Returns a dict with two keys:
    'winner'  - name of the highest scoring category; when several
                categories share the top score, the one listed last in
                the keyword table wins.
    'results' - mapping of category name to its score.
  """
  keywords_by_category = {
      'rat': ['backdoor', 'hack', 'shell', 'dropper'],
      'ransomware': ['ransom', 'crypt'],
      'miner': ['miner', 'coin'],
      'adware': ['pup', 'pua', 'adware', 'cleaner', 'keeper', 'opinion'],
  }

  test_results = {}
  for category, keywords in keywords_by_category.items():
    # one averaged score per keyword, taken across every supplied label
    keyword_means = [
        numpy.mean([fuzz.token_set_ratio(label, keyword) for label in family])
        for keyword in keywords
    ]
    # The leading 0 is the floor; max() keeps the earliest of equal values,
    # so a keyword only replaces the running best when strictly greater.
    test_results[category] = max([0] + keyword_means)

  # Stable ascending sort: the best category ends up last, and among equal
  # scores the category defined latest stays last.
  ranking = sorted(test_results, key=test_results.get)
  return {'winner': ranking[-1], 'results': test_results}


conn = psycopg2.connect("dbname=elk_development user=postgres")
cur = conn.cursor()

# Collect every processed sample that has not been classified yet and that
# carries at least one non-empty family label.
cur.execute("""select sha, vendor_family, other_family 
                from 
                  malware_metadata 
                where 
                  classified_as is null
                  and processing_result = 'PROCESSED'
                  and (
                    vendor_family <> '' OR other_family <> ''
                  )
                    """)
rows = cur.fetchall()
# All rows are in memory now, so the query cursor can be released.
cur.close()

for row in rows:
  file_sha = row[0]
  # The query can still return a row where one label is NULL (None here) and
  # the other is set. Treat None like an empty label so the string handling
  # below never operates on None.
  vendor_family = row[1] or ''
  other_family = row[2] or ''
  if not vendor_family and not other_family:
    # Nothing to classify for this sample; skip it but keep processing the
    # remaining rows.
    continue
  if not vendor_family:
    composite_family = other_family.strip()
  elif not other_family:
    composite_family = vendor_family
  else:
    composite_family = vendor_family + ";" + other_family

  # Labels are ';'-separated; compare them case-insensitively.
  composite_family = composite_family.lower().split(';')
  # Score every label against each category's keywords and keep the best.
  result = identify_family(composite_family)

  # The context manager closes the cursor even if the update fails.
  with conn.cursor() as cursor:
    cursor.execute(
        "update malware_metadata set classified_as = %s, classification_scores = %s where sha = %s",
        (result["winner"], json.dumps(result["results"]), file_sha))
    # Commit per sample so work already done survives a later failure.
    conn.commit()
  print("***** Combined Families: ", result)