569 lines
23 KiB
Python
569 lines
23 KiB
Python
# app/analyzers/fuzzy.py
|
|
|
|
import pyssdeep
|
|
import json
|
|
import os
|
|
import hashlib
|
|
import configparser
|
|
import zlib
|
|
from datetime import datetime
|
|
from typing import Dict, List, Optional, Tuple, Any
|
|
from pathlib import Path
|
|
import binascii
|
|
|
|
class BlockData:
    """Raw bytes of one file block together with the offset they were read from.

    Attributes:
        raw_data: The block's bytes.
        start_offset: Offset of the first byte, as supplied by the caller.
        length: ``len(raw_data)``, captured at construction time.
    """

    def __init__(self, raw_data: bytes, start_offset: int):
        self.raw_data = raw_data
        self.start_offset = start_offset
        self.length = len(raw_data)

    def _iter_rows(self):
        """Yield ``(row_offset, row_bytes)`` for each 16-byte row of the block."""
        for row_offset in range(0, self.length, 16):
            yield row_offset, self.raw_data[row_offset:row_offset + 16]

    def _create_hex_dump(self) -> str:
        """Render the block as hex rows; built lazily, only for display.

        Each row is the absolute offset (8 hex digits), a space, and up to
        16 space-separated byte values padded to 48 characters.
        """
        rendered = []
        for row_offset, row in self._iter_rows():
            hex_part = ' '.join(f'{byte:02x}' for byte in row).ljust(48)
            rendered.append(f"{self.start_offset + row_offset:08x} {hex_part}")
        return '\n'.join(rendered)

    def _create_ascii_repr(self) -> str:
        """Render the block as ASCII rows; built lazily, only for display.

        Bytes outside the printable range 32..126 are shown as ``.``.
        """
        rendered = []
        for _, row in self._iter_rows():
            printable = [chr(byte) if 32 <= byte <= 126 else '.' for byte in row]
            rendered.append(''.join(printable))
        return '\n'.join(rendered)

    def to_dict(self) -> Dict[str, Any]:
        """Return the minimal DB form: offset plus zlib-compressed, base64 text.

        Short keys (``o`` = offset, ``d`` = data) keep the stored JSON small.
        """
        packed = zlib.compress(self.raw_data)
        encoded = binascii.b2a_base64(packed).decode('ascii').strip()
        return {"o": self.start_offset, "d": encoded}

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'BlockData':
        """Rebuild a block from the mapping produced by :meth:`to_dict`."""
        # base64 text -> compressed bytes -> original bytes
        return cls(zlib.decompress(binascii.a2b_base64(data["d"])), data["o"])
|
|
|
|
class BlockMetadata:
    """Index, fuzzy hash and byte range of a single block of a file.

    ``block_size`` is accepted by the constructor and by :meth:`from_dict`
    but is not stored; the byte range is taken from ``data`` itself.
    """

    def __init__(self, index: int, block_size: int, hash_value: str, data: "BlockData"):
        self.index = index
        self.hash = hash_value
        self.data = data
        # The range is derived from the payload rather than index * block_size.
        self.start_offset = data.start_offset
        self.end_offset = data.start_offset + data.length

    def to_dict(self) -> Dict[str, Any]:
        """Return the compact DB form (``i`` = index, ``h`` = hash, ``d`` = data)."""
        serialized = {"i": self.index, "h": self.hash}
        serialized["d"] = self.data.to_dict()
        return serialized

    @classmethod
    def from_dict(cls, data: Dict[str, Any], block_size: int) -> 'BlockMetadata':
        """Rebuild a block record from the mapping produced by :meth:`to_dict`."""
        payload = BlockData.from_dict(data["d"])
        return cls(data["i"], block_size, data["h"], payload)
|
|
|
|
class FileMetadata:
    """Path, MD5, size and per-block records of one indexed file.

    ``date_added`` is set to the current local time (ISO 8601 string) when
    the object is constructed; :meth:`from_dict` replaces it with the stored
    value when one is present.
    """

    def __init__(self, path: str, md5: str, file_size: int, blocks: List["BlockMetadata"]):
        self.path = path
        self.md5 = md5
        self.file_size = file_size
        self.blocks = blocks
        self.date_added = datetime.now().isoformat()

    def to_dict(self) -> Dict[str, Any]:
        """Return the compact DB form.

        Keys: ``p`` path, ``m`` md5, ``s`` size, ``b`` blocks, ``d`` date added.
        """
        serialized_blocks = [block.to_dict() for block in self.blocks]
        return {
            "p": self.path,
            "m": self.md5,
            "s": self.file_size,
            "b": serialized_blocks,
            "d": self.date_added,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any], block_size: int) -> 'FileMetadata':
        """Rebuild a file record from the mapping produced by :meth:`to_dict`."""
        restored_blocks = [
            BlockMetadata.from_dict(entry, block_size) for entry in data["b"]
        ]
        restored = cls(data["p"], data["m"], data["s"], restored_blocks)
        # Records without a stored date fall back to "now".
        fallback_date = datetime.now().isoformat()
        restored.date_added = data.get("d", fallback_date)
        return restored
|
|
|
|
class MatchingRegion:
    """Accumulator for a run of matched block pairs between two files.

    ``length`` is the sum of the *source* block lengths added so far, and
    ``total_similarity`` is the sum of the per-pair scores.
    """

    def __init__(self):
        self.source_start = 0
        self.target_start = 0
        self.length = 0
        self.total_similarity = 0
        self.blocks = 0
        self.source_data: List["BlockData"] = []
        self.target_data: List["BlockData"] = []

    def add_block(self, source_block: "BlockMetadata", target_block: "BlockMetadata", similarity: float):
        """Append one matched pair; the first pair fixes the region's start offsets."""
        if not self.blocks:
            self.source_start = source_block.start_offset
            self.target_start = target_block.start_offset
            self.length = 0
        self.length += source_block.data.length

        self.source_data.append(source_block.data)
        self.target_data.append(target_block.data)
        self.total_similarity += similarity
        self.blocks += 1

    @property
    def avg_similarity(self) -> float:
        """Mean similarity of the pairs added so far; 0 for an empty region."""
        if self.blocks > 0:
            return self.total_similarity / self.blocks
        return 0

    @staticmethod
    def _render(payloads) -> List[Dict[str, str]]:
        """Build the ASCII and hex display strings for each block payload."""
        return [
            {
                "ascii_repr": payload._create_ascii_repr(),
                "hex_dump": payload._create_hex_dump(),
            }
            for payload in payloads
        ]

    def to_dict(self) -> Dict[str, Any]:
        """Return a display-ready mapping; dumps are generated here, on demand."""
        return {
            "source_start": self.source_start,
            "target_start": self.target_start,
            "length": self.length,
            "avg_similarity": self.avg_similarity,
            "blocks": self.blocks,
            "source_data": self._render(self.source_data),
            "target_data": self._render(self.target_data),
        }
|
|
|
|
class GitRepoInfo:
    """Reads information about a Git working tree straight from ``.git/config``."""

    def __init__(self, repo_path: str):
        self.repo_path = Path(repo_path)

    def get_remote_url(self) -> Optional[str]:
        """Return the ``origin`` remote URL, normalized for display.

        ``git@github.com:owner/repo`` URLs are rewritten to
        ``https://github.com/owner/repo`` and a trailing ``.git`` is removed.

        Returns:
            The normalized URL, or ``None`` when there is no ``.git/config``,
            no ``origin`` remote with a non-empty ``url``, or the file cannot
            be read or parsed.
        """
        git_config_path = self.repo_path / '.git' / 'config'
        if not git_config_path.exists():
            return None

        # Git config is only INI-like: a section may legitimately repeat a key
        # (e.g. several "fetch =" lines) and values may contain "%". The
        # default strict, interpolating parser rejects both, which used to
        # make such repositories look as if they had no remote at all.
        config = configparser.ConfigParser(strict=False, interpolation=None)
        try:
            config.read(git_config_path, encoding='utf-8')
        except (configparser.Error, OSError, UnicodeDecodeError):
            # Best effort: an unreadable config is treated as "no remote".
            return None

        for section in config.sections():
            if not section.startswith('remote "origin"'):
                continue

            url = config[section].get('url', '')
            ssh_prefix = 'git@github.com:'
            if url.startswith(ssh_prefix):
                url = f"https://github.com/{url[len(ssh_prefix):]}"
            if url.endswith('.git'):
                url = url[:-4]
            # An origin section without a usable url counts as "no remote".
            return url or None

        return None
|
|
|
|
class FuzzyHashAnalyzer:
    """Block-level fuzzy-hash database plus similarity analysis against it.

    Files are split into fixed-size blocks, each block is hashed with
    ``pyssdeep``, and the results are kept in a zlib-compressed JSON file.
    In memory the database has the shape
    ``{"sources": {source: {"files": {rel_path: FileMetadata}, "last_updated": iso_str}}}``.
    """

    def __init__(self, config, logger=None):
        """Read DB location and extension filter from ``config`` and load the DB.

        Args:
            config: Mapping providing
                ``config['analysis']['doppelganger']['db']`` with the keys
                ``path`` and ``fuzzyhash`` (joined to form the DB directory)
                and optionally ``fuzzy_extensions`` (suffixes without the dot).
            logger: Optional logger; every log call is skipped when falsy.
        """
        self.config = config
        self.logger = logger
        self.block_size = 4096

        db_config = config['analysis']['doppelganger']['db']
        self.db_path = os.path.join(db_config['path'], db_config['fuzzyhash'], 'FuzzyHash.db')
        self.extensions = [f".{ext}" for ext in db_config.get('fuzzy_extensions', [])]
        self.db = self._load_db()

    def _write_db_file(self, payload: bytes) -> None:
        """Write ``payload`` to ``self.db_path``, replacing any previous file.

        The parent directory is created when missing, and the bytes go to a
        sibling temporary file that is then moved over the target, so an
        interrupted write cannot leave a truncated database behind.
        """
        parent = os.path.dirname(self.db_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

        tmp_path = f"{self.db_path}.tmp"
        try:
            with open(tmp_path, 'wb') as f:
                f.write(payload)
            os.replace(tmp_path, self.db_path)
        except OSError:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
            raise

    def _save_db(self):
        """Serialize the in-memory DB to compact JSON and store it at zlib level 9."""
        data = {
            "sources": {
                source: {
                    # Shortened key names: "f" = files, "u" = last updated.
                    "f": {
                        path: file_data.to_dict()
                        for path, file_data in source_data["files"].items()
                    },
                    "u": source_data["last_updated"],
                }
                for source, source_data in self.db["sources"].items()
            }
        }
        json_str = json.dumps(data, separators=(',', ':'))
        self._write_db_file(zlib.compress(json_str.encode('utf-8'), level=9))

    def _load_db(self) -> Dict:
        """Load the compressed database, falling back to an empty one.

        A missing file is created empty. An empty file yields an empty DB
        without being rewritten. If the file cannot be read, decompressed or
        decoded, the error is logged, the file is moved aside to
        ``<db_path>.corrupt`` (best effort) and a fresh empty DB is written.

        Returns:
            The database in its in-memory shape (see the class docstring).
        """
        if not os.path.exists(self.db_path):
            if self.logger:
                self.logger.debug(f"Database file {self.db_path} not found. Creating new database.")
            self._save_empty_db()
            return {"sources": {}}

        try:
            with open(self.db_path, 'rb') as f:
                compressed_data = f.read()

            if not compressed_data:
                return {"sources": {}}

            data = json.loads(zlib.decompress(compressed_data).decode('utf-8'))

            # Expand the shortened on-disk keys back to the in-memory names.
            converted = {"sources": {}}
            for source, sdata in data["sources"].items():
                converted["sources"][source] = {
                    "files": {
                        path: FileMetadata.from_dict(fdata, self.block_size)
                        for path, fdata in sdata["f"].items()
                    },
                    "last_updated": sdata["u"],
                }
            return converted

        except Exception as e:
            if self.logger:
                self.logger.error(f"Error loading database: {e}")
            # Keep the unreadable file for inspection instead of destroying it.
            try:
                os.replace(self.db_path, f"{self.db_path}.corrupt")
            except OSError as backup_error:
                if self.logger:
                    self.logger.warning(f"Could not back up unreadable database: {backup_error}")
            self._save_empty_db()
            return {"sources": {}}

    def _save_empty_db(self):
        """Write a compressed database that contains no sources."""
        self._write_db_file(zlib.compress(json.dumps({"sources": {}}).encode('utf-8'), level=9))

    def _compute_md5(self, file_path: str) -> str:
        """Return the hex MD5 digest of the file, read in 4096-byte chunks."""
        md5_hash = hashlib.md5()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(4096), b""):
                md5_hash.update(chunk)
        return md5_hash.hexdigest()

    def _create_blocks(self, file_path: str) -> List["BlockMetadata"]:
        """Split the file into ``self.block_size`` blocks and fuzzy-hash each.

        A block whose hash cannot be computed is logged and left out. The
        index always reflects the block's position in the file, so the
        offsets of the blocks that follow a failed one remain correct.
        """
        blocks = []
        with open(file_path, 'rb') as f:
            for index, data in enumerate(iter(lambda: f.read(self.block_size), b"")):
                try:
                    hash_value = pyssdeep.fuzzy_hash_buf(data, len(data))
                except Exception as e:
                    if self.logger:
                        self.logger.warning(f"Could not compute fuzzy hash for block {index}: {str(e)}")
                    continue
                block_data = BlockData(data, index * self.block_size)
                blocks.append(BlockMetadata(index, self.block_size, hash_value, block_data))
        return blocks

    def compute_file_metadata(self, file_path: str) -> "FileMetadata":
        """Return MD5, size and hashed blocks for ``file_path``.

        Raises:
            OSError: If the file cannot be opened or its size cannot be read.
        """
        md5 = self._compute_md5(file_path)
        file_size = os.path.getsize(file_path)
        blocks = self._create_blocks(file_path)
        return FileMetadata(file_path, md5, file_size, blocks)

    def _compare_blocks(self, blocks1: List["BlockMetadata"], blocks2: List["BlockMetadata"]) -> Dict:
        """Match each block of ``blocks1`` to its best counterpart in ``blocks2``.

        Consecutive matches whose source and target offsets both continue
        the current run are merged into one region.

        Returns:
            A dict with ``overall_similarity`` (sum of the best scores
            divided by the smaller block count, capped at 100),
            ``matching_regions`` (display dicts) and ``total_regions``.
        """
        matching_regions = []
        current_region = None
        similarity_sum = 0
        total_comparisons = min(len(blocks1), len(blocks2))

        for b1 in blocks1:
            best_block = None
            best_similarity = 0
            for b2 in blocks2:
                similarity = pyssdeep.fuzzy_compare(b1.hash, b2.hash)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_block = b2
                    # ssdeep scores top out at 100, so nothing later can win.
                    if best_similarity >= 100:
                        break

            if best_block is None:
                continue

            continues_run = (
                current_region is not None
                and b1.start_offset == current_region.source_start + current_region.length
                and best_block.start_offset == current_region.target_start + current_region.length
            )
            if not continues_run:
                if current_region is not None:
                    matching_regions.append(current_region)
                current_region = MatchingRegion()
            current_region.add_block(b1, best_block, best_similarity)
            similarity_sum += best_similarity

        if current_region is not None:
            matching_regions.append(current_region)

        if total_comparisons > 0:
            # Several source blocks can match the same target block, so the
            # sum may exceed 100 * total_comparisons when blocks1 is the
            # longer list; cap the result so it stays a valid 0-100 score.
            overall = min(100.0, similarity_sum / total_comparisons)
        else:
            overall = 0

        return {
            "overall_similarity": overall,
            "matching_regions": [region.to_dict() for region in matching_regions],
            "total_regions": len(matching_regions),
        }

    def find_git_root(self, path: Path) -> Optional[Tuple[str, Path]]:
        """Walk up from ``path`` to the nearest Git repo that has a remote URL.

        Returns:
            ``(remote_url, repo_root)`` for the first ancestor (starting at
            ``path`` itself) with a ``.git`` directory and a remote URL,
            otherwise ``("Private Collection", path.parent)``. As written
            this never returns ``None``.
        """
        current = path
        while current != current.parent:
            if (current / '.git').is_dir():
                remote_url = GitRepoInfo(current).get_remote_url()
                if remote_url:
                    # Convert git URLs to HTTPS format if needed.
                    if remote_url.startswith('git@github.com:'):
                        remote_url = f"https://github.com/{remote_url.split('git@github.com:')[1]}"
                    if remote_url.endswith('.git'):
                        remote_url = remote_url[:-4]
                    return remote_url, current
            current = current.parent
        return "Private Collection", path.parent

    def create_db_from_folder(self, folder_path: str, extensions: Optional[List[str]] = None) -> Dict:
        """Index every matching file under ``folder_path`` and save the DB.

        Args:
            folder_path: Directory to scan recursively.
            extensions: Suffixes (with leading dot) to include; ``None``
                uses the configured list. An empty list includes all files.
                Matching is case-insensitive.

        Returns:
            A dict with the ``processed`` and ``skipped`` counts, their
            ``total`` and the sorted list of ``sources`` that received files.

        Raises:
            Exception: With the message prefix ``Database creation failed:``
                when the folder is missing or the scan or save fails; the
                original error is chained as the cause.
        """
        try:
            folder = Path(folder_path)
            if not folder.exists():
                raise Exception(f"Folder not found: {folder_path}")

            # Use provided extensions or fall back to configured extensions.
            chosen = extensions if extensions is not None else self.extensions
            allowed_suffixes = {ext.lower() for ext in chosen}

            processed = 0
            skipped = 0
            sources_found = set()

            for file_path in folder.rglob('*'):
                if not file_path.is_file():
                    continue

                if allowed_suffixes and file_path.suffix.lower() not in allowed_suffixes:
                    skipped += 1
                    continue

                if '.git' in file_path.parts:
                    skipped += 1
                    continue

                try:
                    repo_info = self.find_git_root(file_path)
                    if not repo_info:
                        if self.logger:
                            self.logger.warning(f"No git repository found for {file_path}")
                        skipped += 1
                        continue

                    source, _repo_root = repo_info

                    # Hash first, so a failure does not leave an empty source entry.
                    file_metadata = self.compute_file_metadata(str(file_path))
                    rel_path = str(file_path.relative_to(folder))

                    timestamp = datetime.now().isoformat()
                    source_entry = self.db["sources"].setdefault(
                        source, {"files": {}, "last_updated": timestamp}
                    )
                    source_entry["files"][rel_path] = file_metadata
                    # Refresh on every addition, not only when the source is new.
                    source_entry["last_updated"] = timestamp

                    processed += 1
                    sources_found.add(source)

                    if self.logger:
                        self.logger.debug(f"Processed: {rel_path}")

                except Exception as e:
                    if self.logger:
                        self.logger.error(f"Error processing {file_path}: {str(e)}")
                    skipped += 1

            self._save_db()
            return {
                "processed": processed,
                "skipped": skipped,
                "total": processed + skipped,
                "sources": sorted(sources_found),
            }

        except Exception as e:
            if self.logger:
                self.logger.error(f"Database creation failed: {str(e)}")
            raise Exception(f"Database creation failed: {str(e)}") from e

    def analyze_files(self, file_paths: List[str], threshold: int = 1) -> List[Dict]:
        """Compare each file with every DB entry; keep the top 3 matches per file.

        Files that cannot be read are logged and omitted from the result.

        Args:
            file_paths: Paths of the files to analyze.
            threshold: Minimum ``overall_similarity`` for a DB entry to count
                as a match.

        Returns:
            One result dict per readable file, in input order. ``matches``
            holds at most three entries, ``total_matches`` counts all of them.
        """
        results = []

        for file_path in file_paths:
            try:
                current_metadata = self.compute_file_metadata(file_path)
            except Exception as e:
                if self.logger:
                    self.logger.error(f"Error processing {file_path}: {str(e)}")
                continue

            matches = []
            for source_url, source_data in self.db["sources"].items():
                for rel_path, file_data in source_data["files"].items():
                    comparison = self._compare_blocks(current_metadata.blocks, file_data.blocks)
                    if comparison["overall_similarity"] < threshold:
                        continue
                    matches.append({
                        "source": source_url,
                        "file": rel_path,
                        "overall_similarity": comparison["overall_similarity"],
                        "md5": file_data.md5,
                        "matching_regions": comparison["matching_regions"],
                        "total_regions": comparison["total_regions"],
                        "target_size": file_data.file_size,
                        "date_added": file_data.date_added,
                    })

            # Sort matches by similarity and take top 3.
            sorted_matches = sorted(matches, key=lambda x: x["overall_similarity"], reverse=True)[:3]

            results.append({
                "file": os.path.basename(file_path),
                "path": file_path,
                "md5": current_metadata.md5,
                "file_size": current_metadata.file_size,
                "total_blocks": len(current_metadata.blocks),
                "matches": sorted_matches,
                "total_matches": len(matches),
            })

        return results

    def get_db_stats(self) -> Dict[str, Any]:
        """Return file counts, sizes and update times for the loaded database.

        ``db_size`` is the size of the database file on disk (0 when it
        cannot be read); ``total_size`` sums ``file_size`` over all indexed
        files; ``last_updated`` is the newest per-source timestamp or ``None``.
        """
        stats = {
            "total_files": 0,
            "total_size": 0,  # Total size of indexed files
            "db_size": 0,  # Size of the database file itself
            "sources": {},
            "last_updated": None,
        }

        try:
            stats["db_size"] = os.path.getsize(self.db_path)
            stats["db_size_human"] = self._format_size(stats["db_size"])
        except (OSError, IOError) as e:
            if self.logger:
                self.logger.error(f"Error getting database file size: {e}")
            stats["db_size"] = 0
            stats["db_size_human"] = "0 B"

        latest = None
        for source, source_data in self.db["sources"].items():
            files = source_data["files"]
            stats["total_files"] += len(files)
            stats["total_size"] += sum(file_data.file_size for file_data in files.values())
            stats["sources"][source] = {
                "file_count": len(files),
                "last_updated": source_data["last_updated"],
            }

            # Track most recent update.
            source_date = datetime.fromisoformat(source_data["last_updated"])
            if latest is None or source_date > latest:
                latest = source_date
                stats["last_updated"] = source_data["last_updated"]

        stats["total_size_human"] = self._format_size(stats["total_size"])
        return stats

    def _format_size(self, size: int) -> str:
        """Format a byte count with two decimals, using units from B up to TB."""
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            # TB is the last unit, so larger values are reported in TB as well.
            if size < 1024 or unit == 'TB':
                return f"{size:.2f} {unit}"
            size /= 1024

    @staticmethod
    def _visible_rows(blocks: List[Dict[str, str]]) -> List[Dict[str, str]]:
        """Pair up ASCII and hex rows, dropping rows whose ASCII is only dots."""
        rows = []
        for block_data in blocks:
            ascii_lines = block_data['ascii_repr'].split('\n')
            hex_lines = block_data['hex_dump'].split('\n')
            for ascii_line, hex_line in zip(ascii_lines, hex_lines):
                if set(ascii_line) != {'.'}:
                    rows.append({"ascii": ascii_line, "hex": hex_line})
        return rows

    def print_matching_region(self, region: Dict[str, Any], source_file: str, target_file: str) -> Dict[str, Any]:
        """Build a hex/ASCII comparison report for one matching region.

        Despite the name, nothing is printed; the report is returned.

        Args:
            region: A region dict as produced by ``MatchingRegion.to_dict``.
            source_file: Label stored under ``source_file``.
            target_file: Label stored under ``target_file``.
        """
        length = region['length']
        source_start = region['source_start']
        target_start = region['target_start']

        comparison_data = {
            "similarity": round(region['avg_similarity'], 2),
            "length": length,
            "source_file": source_file,
            "source_range": {
                "start": f"{source_start:08x}",
                "end": f"{source_start + length:08x}",
            },
            "target_file": target_file,
            "target_range": {
                "start": f"{target_start:08x}",
                "end": f"{target_start + length:08x}",
            },
            "source_data": self._visible_rows(region['source_data']),
            "target_data": self._visible_rows(region['target_data']),
        }

        if self.logger:
            self.logger.debug(f"Processed matching region comparison for {source_file} and {target_file}")

        return comparison_data