参考链接: Python中的numpy.binary_repr
#!/usr/bin/python
'''
Extracts some basic features from PE files. Many of the features implemented
have been used in previously published works. For more information, check out
the following resources:
* Schultz, et al., 2001:
* Kolter and Maloof, 2006: http://www.jmlr.org/papers/volume7/kolter06a/kolter06a.pdf
* Shafiq et al., 2009: https://www.researchgate.net/profile/Fauzan_Mirza/publication/242084613_A_Framework_for_Efficient_Mining_of_Structural_Information_to_Detect_Zero-Day_Malicious_Portable_Executables/links/0c96052e191668c3d5000000.pdf
* Raman, 2012: http://2012.infosecsouthwest.com/files/speaker_materials/ISSW2012_Selecting_Features_to_Classify_Malware.pdf
* Saxe and Berlin, 2015: https://arxiv.org/pdf/1508.03096.pdf

It may be useful to do feature selection to reduce this set of features to a
meaningful set for your modeling problem.
'''
import re import lief import hashlib import numpy as np from sklearn.feature_extraction import FeatureHasher
class FeatureType(object):
    ''' Base class from which each feature type may inherit.

    Subclasses override `name`, `dim`, `raw_features` and
    `process_raw_features`; `feature_vector` chains the two steps.
    '''

    # Identifier of the feature group (PEFeatureExtractor uses it as the key
    # of this group's entry in the raw-feature dict).
    name = ''
    # Declared length of the vector produced by process_raw_features.
    dim = 0

    def __repr__(self):
        return '{}({})'.format(self.name, self.dim)

    def raw_features(self, bytez, lief_binary):
        ''' Generate a JSON-able representation of the file.

        Raises NotImplementedError unless overridden by a subclass.
        '''
        # NotImplemented is a comparison sentinel, not an exception class;
        # raising it produces a confusing TypeError instead.
        raise NotImplementedError

    def process_raw_features(self, raw_obj):
        ''' Generate a feature vector from the raw features.

        Raises NotImplementedError unless overridden by a subclass.
        '''
        raise NotImplementedError

    def feature_vector(self, bytez, lief_binary):
        ''' Directly calculate the feature vector from the sample itself.

        This should only be implemented differently if there are significant
        speedups to be gained from combining the two functions.
        '''
        return self.process_raw_features(self.raw_features(bytez, lief_binary))
class ByteHistogram(FeatureType):
    ''' Byte histogram over the entire binary file.

    The raw features are the 256 per-byte-value counts; the feature vector is
    those counts divided by their total.
    '''

    name = 'histogram'
    dim = 256

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return the count of each byte value (0-255) as a list of ints.

        `lief_binary` is not used by this feature type.
        '''
        counts = np.bincount(np.frombuffer(bytez, dtype=np.uint8), minlength=256)
        return counts.tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector summing to 1.

        An all-zero histogram (empty input) is returned as zeros rather than
        dividing by zero and filling the vector with NaN.
        '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            return counts
        return counts / total
class ByteEntropyHistogram(FeatureType):
    ''' 2d byte/entropy histogram based loosely on (Saxe and Berlin, 2015).

    This roughly approximates the joint probability of byte value and local
    entropy. See Section 2.1.1 in https://arxiv.org/pdf/1508.03096.pdf for
    more info.
    '''

    name = 'byteentropy'
    dim = 256  # 16 entropy bins x 16 coarse byte bins, flattened

    def __init__(self, step=1024, window=2048):
        ''' `window` is the block length in bytes, `step` the stride between
        the starts of consecutive blocks. '''
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()
        self.window = window
        self.step = step

    def _entropy_bin_counts(self, block):
        ''' Return (entropy bin index in 0..15, 16-bin coarse byte histogram)
        for one block of uint8 values. '''
        # coarse histogram, 16 bytes per bin
        c = np.bincount(block >> 4, minlength=16)  # 16-bin histogram
        # Counts are divided by the configured window, also when the block is
        # shorter than the window (the short-file path in raw_features).
        p = c.astype(np.float32) / self.window
        wh = np.where(c)[0]
        # x2 b.c. we reduced information by half: 256 bins (8 bits) to 16 bins (4 bits)
        H = np.sum(-p[wh] * np.log2(p[wh])) * 2

        # up to 16 bins (max entropy is 8 bits); entropy = 8.0 bits would give
        # index 16, so clamp it into the last bin
        Hbin = min(int(H * 2), 15)

        return Hbin, c

    def raw_features(self, bytez, lief_binary):
        ''' Return the flattened 16x16 count matrix as a list of 256 ints.

        Row = entropy bin of a block, column = coarse byte bin; each block
        adds its coarse byte histogram to the row of its entropy bin.
        `lief_binary` is not used by this feature type.
        '''
        # Builtin int is what the removed np.int alias resolved to.
        output = np.zeros((16, 16), dtype=int)
        a = np.frombuffer(bytez, dtype=np.uint8)
        if a.shape[0] < self.window:
            # File shorter than one window: treat the whole file as one block.
            Hbin, c = self._entropy_bin_counts(a)
            output[Hbin, :] += c
        else:
            # strided trick from here: http://www.rigtorp.se/2011/01/01/rolling-statistics-numpy.html
            shape = a.shape[:-1] + (a.shape[-1] - self.window + 1, self.window)
            strides = a.strides + (a.strides[-1],)
            # View of every window-length slice, then keep every step-th one.
            blocks = np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::self.step, :]

            # from the blocks, compute histogram
            for block in blocks:
                Hbin, c = self._entropy_bin_counts(block)
                output[Hbin, :] += c

        return output.flatten().tolist()

    def process_raw_features(self, raw_obj):
        ''' Normalize the raw counts into a float32 vector summing to 1.

        An all-zero histogram (empty input) is returned as zeros rather than
        dividing by zero and filling the vector with NaN.
        '''
        counts = np.array(raw_obj, dtype=np.float32)
        total = counts.sum()
        if total == 0:
            return counts
        return counts / total
class SectionInfo(FeatureType):
    ''' Information about section names, sizes and entropy.

    Uses hashing trick to summarize all this section info into a feature
    vector.
    '''

    name = 'section'
    # 5 general counts + five hashed groups of 50 buckets each
    dim = 5 + 50 + 50 + 50 + 50 + 50

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    @staticmethod
    def _properties(s):
        ''' Return the section's characteristics as bare names: the text after
        the last '.' of each item's str(). '''
        return [str(c).split('.')[-1] for c in s.characteristics_lists]

    @staticmethod
    def _hash50(sample, input_type):
        ''' Hash a single sample into a dense vector of 50 buckets. '''
        return FeatureHasher(50, input_type=input_type).transform([sample]).toarray()[0]

    def raw_features(self, bytez, lief_binary):
        ''' Return {"entry": <section name>, "sections": [<per-section dict>]}.

        When `lief_binary` is None the entry name is empty and the section
        list is empty. `bytez` is not used by this feature type.
        '''
        if lief_binary is None:
            return {"entry": "", "sections": []}

        # properties of entry point, or if invalid, the first executable section
        try:
            entry_section = lief_binary.section_from_offset(lief_binary.entrypoint).name
        except lief.not_found:
            # bad entry point, let's find the first executable section
            entry_section = next(
                (s.name for s in lief_binary.sections
                 if lief.PE.SECTION_CHARACTERISTICS.MEM_EXECUTE in s.characteristics_lists),
                "")

        raw_obj = {"entry": entry_section}
        raw_obj["sections"] = [{
            'name': s.name,
            'size': s.size,
            'entropy': s.entropy,
            'vsize': s.virtual_size,
            'props': self._properties(s)
        } for s in lief_binary.sections]
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Turn the raw section description into a float32 vector.

        Layout: 5 general counts, then 50 hashed buckets each for
        (name, size), (name, entropy), (name, vsize), the entry section name,
        and the characteristics of the section(s) named like the entry.
        '''
        sections = raw_obj['sections']
        general = [
            len(sections),  # total number of sections
            # number of sections with zero size
            sum(1 for s in sections if s['size'] == 0),
            # number of sections with an empty name
            sum(1 for s in sections if s['name'] == ""),
            # number of RX
            sum(1 for s in sections if 'MEM_READ' in s['props'] and 'MEM_EXECUTE' in s['props']),
            # number of W
            sum(1 for s in sections if 'MEM_WRITE' in s['props'])
        ]
        # gross characteristics of each section
        section_sizes_hashed = self._hash50([(s['name'], s['size']) for s in sections], "pair")
        section_entropy_hashed = self._hash50([(s['name'], s['entropy']) for s in sections], "pair")
        section_vsize_hashed = self._hash50([(s['name'], s['vsize']) for s in sections], "pair")
        # The sample must be an iterable of tokens. A bare string would be
        # hashed character by character (recent scikit-learn releases raise
        # ValueError for it), so wrap the name in a list to hash it whole.
        entry_name_hashed = self._hash50([raw_obj['entry']], "string")
        characteristics = [p for s in sections for p in s['props'] if s['name'] == raw_obj['entry']]
        characteristics_hashed = self._hash50(characteristics, "string")

        return np.hstack([
            general, section_sizes_hashed, section_entropy_hashed, section_vsize_hashed, entry_name_hashed,
            characteristics_hashed
        ]).astype(np.float32)
class ImportsInfo(FeatureType):
    ''' Information about imported libraries and functions from the import
    address table.

    Note that the total number of imported functions is contained in
    GeneralFileInfo.
    '''

    name = 'imports'
    dim = 1280  # 256 library buckets + 1024 "library:function" buckets

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict mapping each library name to the list of entry names
        imported from it; an empty dict when `lief_binary` is None.

        `bytez` is not used by this feature type.
        '''
        imports = {}
        if lief_binary is None:
            return imports

        for lib in lief_binary.imports:
            # libraries can be duplicated in listing, extend instead of overwrite

            # Clipping assumes there are diminishing returns on the discriminatory power of imported functions
            # beyond the first 10000 characters, and this will help limit the dataset size
            imports.setdefault(lib.name, []).extend(entry.name[:10000] for entry in lib.entries)

        return imports

    def process_raw_features(self, raw_obj):
        ''' Hash library names and "library:function" strings into a float32
        vector of 256 + 1024 buckets. '''
        # unique libraries (case-insensitive)
        libraries = list({lib.lower() for lib in raw_obj})
        libraries_hashed = FeatureHasher(256, input_type="string").transform([libraries]).toarray()[0]

        # A string like "kernel32.dll:CreateFileMappingA" for each imported function
        imports = [lib.lower() + ':' + e for lib, elist in raw_obj.items() for e in elist]
        imports_hashed = FeatureHasher(1024, input_type="string").transform([imports]).toarray()[0]

        # Two separate elements: libraries (alone) and fully-qualified names of imported functions
        return np.hstack([libraries_hashed, imports_hashed]).astype(np.float32)
class ExportsInfo(FeatureType):
    ''' Information about exported functions.

    Note that the total number of exported functions is contained in
    GeneralFileInfo.
    '''

    name = 'exports'
    dim = 128

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return the list of exported functions, each clipped to 10000
        characters; an empty list when `lief_binary` is None.

        `bytez` is not used by this feature type.
        '''
        if lief_binary is None:
            return []

        # Clipping assumes there are diminishing returns on the discriminatory power of exports beyond
        # the first 10000 characters, and this will help limit the dataset size
        return [export[:10000] for export in lief_binary.exported_functions]

    def process_raw_features(self, raw_obj):
        ''' Hash the export names into a float32 vector of 128 buckets. '''
        exports_hashed = FeatureHasher(128, input_type="string").transform([raw_obj]).toarray()[0]
        return exports_hashed.astype(np.float32)
class GeneralFileInfo(FeatureType):
    ''' General information about the file '''

    name = 'general'
    dim = 10

    # Raw-feature keys, in the order they appear in the feature vector.
    _FIELDS = (
        'size', 'vsize', 'has_debug', 'exports', 'imports', 'has_relocations',
        'has_resources', 'has_signature', 'has_tls', 'symbols',
    )

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict with one entry per key in `_FIELDS`.

        When `lief_binary` is None only 'size' (the length of `bytez`) is
        populated; every other value is 0.
        '''
        if lief_binary is None:
            raw_obj = dict.fromkeys(self._FIELDS, 0)
            raw_obj['size'] = len(bytez)
            return raw_obj

        return {
            'size': len(bytez),
            'vsize': lief_binary.virtual_size,
            'has_debug': int(lief_binary.has_debug),
            'exports': len(lief_binary.exported_functions),
            'imports': len(lief_binary.imported_functions),
            'has_relocations': int(lief_binary.has_relocations),
            'has_resources': int(lief_binary.has_resources),
            'has_signature': int(lief_binary.has_signature),
            'has_tls': int(lief_binary.has_tls),
            'symbols': len(lief_binary.symbols),
        }

    def process_raw_features(self, raw_obj):
        ''' Return the raw values as a float32 vector, in `_FIELDS` order. '''
        return np.asarray([raw_obj[field] for field in self._FIELDS], dtype=np.float32)
class HeaderFileInfo(FeatureType):
    ''' Machine, architecture, OS, linker and other information extracted from header '''

    name = 'header'
    dim = 62  # 1 timestamp + 5 hashed groups of 10 buckets + 11 numeric fields

    # Numeric optional-header fields. Each name is both the raw-feature key
    # and the attribute read from lief_binary.optional_header; the order is
    # the order used in the feature vector.
    _OPTIONAL_NUMERIC = (
        'major_image_version', 'minor_image_version',
        'major_linker_version', 'minor_linker_version',
        'major_operating_system_version', 'minor_operating_system_version',
        'major_subsystem_version', 'minor_subsystem_version',
        'sizeof_code', 'sizeof_headers', 'sizeof_heap_commit',
    )

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()

    @staticmethod
    def _enum_name(value):
        ''' Return the text after the last '.' of str(value). '''
        return str(value).split('.')[-1]

    @staticmethod
    def _hash10(tokens):
        ''' Hash a list of string tokens into a dense vector of 10 buckets. '''
        return FeatureHasher(10, input_type="string").transform([tokens]).toarray()[0]

    def raw_features(self, bytez, lief_binary):
        ''' Return {'coff': {...}, 'optional': {...}} describing the headers.

        When `lief_binary` is None strings are empty, lists are empty and
        numbers are 0. `bytez` is not used by this feature type.
        '''
        raw_obj = {}
        raw_obj['coff'] = {'timestamp': 0, 'machine': "", 'characteristics': []}
        raw_obj['optional'] = {
            'subsystem': "",
            'dll_characteristics': [],
            'magic': "",
        }
        for field in self._OPTIONAL_NUMERIC:
            raw_obj['optional'][field] = 0
        if lief_binary is None:
            return raw_obj

        header = lief_binary.header
        optional_header = lief_binary.optional_header

        raw_obj['coff']['timestamp'] = header.time_date_stamps
        raw_obj['coff']['machine'] = self._enum_name(header.machine)
        raw_obj['coff']['characteristics'] = [self._enum_name(c) for c in header.characteristics_list]
        raw_obj['optional']['subsystem'] = self._enum_name(optional_header.subsystem)
        raw_obj['optional']['dll_characteristics'] = [
            self._enum_name(c) for c in optional_header.dll_characteristics_lists
        ]
        raw_obj['optional']['magic'] = self._enum_name(optional_header.magic)
        for field in self._OPTIONAL_NUMERIC:
            raw_obj['optional'][field] = getattr(optional_header, field)
        return raw_obj

    def process_raw_features(self, raw_obj):
        ''' Return a float32 vector: timestamp, five hashed groups of 10
        buckets (machine, COFF characteristics, subsystem, DLL
        characteristics, magic), then the 11 numeric optional-header fields.
        '''
        coff = raw_obj['coff']
        optional = raw_obj['optional']
        return np.hstack([
            coff['timestamp'],
            # single strings are wrapped in a list so they hash as one token
            self._hash10([coff['machine']]),
            self._hash10(coff['characteristics']),
            self._hash10([optional['subsystem']]),
            self._hash10(optional['dll_characteristics']),
            self._hash10([optional['magic']]),
        ] + [optional[field] for field in self._OPTIONAL_NUMERIC]).astype(np.float32)
class StringExtractor(FeatureType):
    ''' Extracts strings from raw byte stream '''

    name = 'strings'
    # numstrings, avlength, printables, 96-bin histogram, entropy, paths, urls, registry, MZ
    dim = 1 + 1 + 1 + 96 + 1 + 1 + 1 + 1 + 1

    def __init__(self):
        # Zero-argument super() starts the MRO lookup after this class;
        # super(FeatureType, self) would skip FeatureType itself.
        super().__init__()
        # all consecutive runs of 0x20 - 0x7f that are 5+ characters
        self._allstrings = re.compile(b'[\x20-\x7f]{5,}')
        # occurrences of the string 'C:\'. Not actually extracting the path
        self._paths = re.compile(b'c:\\\\', re.IGNORECASE)
        # occurrences of http:// or https://. Not actually extracting the URLs
        self._urls = re.compile(b'https?://', re.IGNORECASE)
        # occurrences of the string prefix HKEY_. Not actually extracting registry names
        self._registry = re.compile(b'HKEY_')
        # crude evidence of an MZ header (dropper?) somewhere in the byte stream
        self._mz = re.compile(b'MZ')

    def raw_features(self, bytez, lief_binary):
        ''' Return a dict of string statistics computed from `bytez`.

        Keys: 'numstrings', 'avlength', 'printabledist' (non-normalized
        96-bin character histogram), 'printables' (total characters in the
        strings), 'entropy' (of the character distribution), and match counts
        'paths', 'urls', 'registry', 'MZ'. `lief_binary` is not used.
        '''
        allstrings = self._allstrings.findall(bytez)
        if allstrings:
            # statistics about strings:
            string_lengths = [len(s) for s in allstrings]
            avlength = sum(string_lengths) / len(string_lengths)
            # map printable characters 0x20 - 0x7f to an int array consisting of 0-95, inclusive
            # (vectorized; every matched byte is >= 0x20, so the uint8 subtraction cannot wrap)
            as_shifted_string = np.frombuffer(b''.join(allstrings), dtype=np.uint8) - ord(b'\x20')
            c = np.bincount(as_shifted_string, minlength=96)  # histogram count
            # distribution of characters in printable strings
            csum = c.sum()
            p = c.astype(np.float32) / csum
            wh = np.where(c)[0]
            H = np.sum(-p[wh] * np.log2(p[wh]))  # entropy
        else:
            avlength = 0
            c = np.zeros((96,), dtype=np.float32)
            H = 0
            csum = 0

        return {
            'numstrings': len(allstrings),
            'avlength': avlength,
            'printabledist': c.tolist(),  # store non-normalized histogram
            'printables': int(csum),
            'entropy': float(H),
            'paths': len(self._paths.findall(bytez)),
            'urls': len(self._urls.findall(bytez)),
            'registry': len(self._registry.findall(bytez)),
            'MZ': len(self._mz.findall(bytez))
        }

    def process_raw_features(self, raw_obj):
        ''' Return a float32 vector in the order given by `dim`; the character
        histogram is divided by the number of printable characters (or by 1
        when there are none). '''
        hist_divisor = float(raw_obj['printables']) if raw_obj['printables'] > 0 else 1.0
        return np.hstack([
            raw_obj['numstrings'], raw_obj['avlength'], raw_obj['printables'],
            np.asarray(raw_obj['printabledist']) / hist_divisor, raw_obj['entropy'], raw_obj['paths'], raw_obj['urls'],
            raw_obj['registry'], raw_obj['MZ']
        ]).astype(np.float32)
class PEFeatureExtractor(object):
    ''' Extract useful features from a PE file, and return as a vector of fixed size. '''

    # One instance of every feature type, in the order their vectors are
    # concatenated by process_raw_features.
    features = [
        ByteHistogram(),
        ByteEntropyHistogram(),
        StringExtractor(),
        GeneralFileInfo(),
        HeaderFileInfo(),
        SectionInfo(),
        ImportsInfo(),
        ExportsInfo()
    ]
    # Total vector length: the sum of the declared dims of all feature types.
    dim = sum([fe.dim for fe in features])

    def raw_features(self, bytez):
        ''' Return a JSON-able dict: the file's "sha256" plus one entry per
        feature type, keyed by its `name`.

        If lief fails with one of the listed errors, the message is printed
        and every feature type receives `lief_binary=None`. Any other
        exception propagates to the caller.
        '''
        try:
            lief_binary = lief.PE.parse(list(bytez))
        except (lief.bad_format, lief.bad_file, lief.pe_error, lief.parser_error, RuntimeError) as e:
            # NOTE(review): these lief exception names presumably depend on
            # the installed lief version - confirm.
            print("lief error: ", str(e))
            lief_binary = None
        # No catch-all handler: everything else (including KeyboardInterrupt
        # and SystemExit, which are not Exception subclasses) propagates.

        features = {"sha256": hashlib.sha256(bytez).hexdigest()}
        features.update({fe.name: fe.raw_features(bytez, lief_binary) for fe in self.features})
        return features

    def process_raw_features(self, raw_obj):
        ''' Concatenate the vectors of all feature types into one float32 vector. '''
        feature_vectors = [fe.process_raw_features(raw_obj[fe.name]) for fe in self.features]
        return np.hstack(feature_vectors).astype(np.float32)

    def feature_vector(self, bytez):
        ''' Compute the feature vector directly from the file contents. '''
        return self.process_raw_features(self.raw_features(bytez))
如有侵权,请联系 cloudcommunity@tencent.com 删除。
如有侵权,请联系 cloudcommunity@tencent.com 删除。