Files
litterbox/app/analyzers/fuzzy.py
T
2025-02-16 14:52:10 -08:00

569 lines
23 KiB
Python

# app/analyzers/fuzzy.py
import pyssdeep
import json
import os
import hashlib
import configparser
import zlib
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from pathlib import Path
import binascii
class BlockData:
def __init__(self, raw_data: bytes, start_offset: int):
self.raw_data = raw_data
self.start_offset = start_offset
self.length = len(raw_data)
def _create_hex_dump(self) -> str:
"""Only created when displaying results"""
hex_lines = []
for i in range(0, self.length, 16):
chunk = self.raw_data[i:i + 16]
hex_values = ' '.join(f'{b:02x}' for b in chunk)
hex_values = hex_values.ljust(48)
hex_lines.append(f"{self.start_offset + i:08x} {hex_values}")
return '\n'.join(hex_lines)
def _create_ascii_repr(self) -> str:
"""Only created when displaying results"""
ascii_lines = []
for i in range(0, self.length, 16):
chunk = self.raw_data[i:i + 16]
ascii_str = ''.join(chr(b) if 32 <= b <= 126 else '.' for b in chunk)
ascii_lines.append(ascii_str)
return '\n'.join(ascii_lines)
def to_dict(self) -> Dict[str, Any]:
"""Store absolute minimum in DB, converting bytes to base64 string"""
compressed = zlib.compress(self.raw_data)
return {
"o": self.start_offset, # Shortened key names
"d": binascii.b2a_base64(compressed).decode('ascii').strip() # Compress and convert to base64
}
@classmethod
def from_dict(cls, data: Dict[str, Any]) -> 'BlockData':
"""Reconstruct from minimal data"""
compressed = binascii.a2b_base64(data["d"]) # Convert base64 back to bytes
raw_data = zlib.decompress(compressed)
return cls(raw_data, data["o"])
class BlockMetadata:
def __init__(self, index: int, block_size: int, hash_value: str, data: BlockData):
self.index = index
self.start_offset = data.start_offset
self.end_offset = data.start_offset + data.length
self.hash = hash_value
self.data = data
def to_dict(self) -> Dict[str, Any]:
return {
"i": self.index, # Shortened key names
"h": self.hash,
"d": self.data.to_dict()
}
@classmethod
def from_dict(cls, data: Dict[str, Any], block_size: int) -> 'BlockMetadata':
block_data = BlockData.from_dict(data["d"])
return cls(data["i"], block_size, data["h"], block_data)
class FileMetadata:
def __init__(self, path: str, md5: str, file_size: int, blocks: List[BlockMetadata]):
self.path = path
self.md5 = md5
self.file_size = file_size
self.blocks = blocks
self.date_added = datetime.now().isoformat()
def to_dict(self) -> Dict[str, Any]:
return {
"p": self.path, # Shortened key names
"m": self.md5,
"s": self.file_size,
"b": [b.to_dict() for b in self.blocks],
"d": self.date_added
}
@classmethod
def from_dict(cls, data: Dict[str, Any], block_size: int) -> 'FileMetadata':
blocks = [BlockMetadata.from_dict(b, block_size) for b in data["b"]]
instance = cls(data["p"], data["m"], data["s"], blocks)
instance.date_added = data.get("d", datetime.now().isoformat())
return instance
class MatchingRegion:
def __init__(self):
self.source_start = 0
self.target_start = 0
self.length = 0
self.total_similarity = 0
self.blocks = 0
self.source_data: List[BlockData] = []
self.target_data: List[BlockData] = []
def add_block(self, source_block: BlockMetadata, target_block: BlockMetadata, similarity: float):
if self.blocks == 0:
self.source_start = source_block.start_offset
self.target_start = target_block.start_offset
self.length = source_block.data.length
else:
self.length += source_block.data.length
self.source_data.append(source_block.data)
self.target_data.append(target_block.data)
self.total_similarity += similarity
self.blocks += 1
@property
def avg_similarity(self) -> float:
return self.total_similarity / self.blocks if self.blocks > 0 else 0
def to_dict(self) -> Dict[str, Any]:
"""When converting for display, create the ASCII and hex representations"""
results = {
"source_start": self.source_start,
"target_start": self.target_start,
"length": self.length,
"avg_similarity": self.avg_similarity,
"blocks": self.blocks,
"source_data": [],
"target_data": []
}
# Only create display data when needed
for src_data in self.source_data:
results["source_data"].append({
"ascii_repr": src_data._create_ascii_repr(),
"hex_dump": src_data._create_hex_dump()
})
for tgt_data in self.target_data:
results["target_data"].append({
"ascii_repr": tgt_data._create_ascii_repr(),
"hex_dump": tgt_data._create_hex_dump()
})
return results
class GitRepoInfo:
def __init__(self, repo_path: str):
self.repo_path = Path(repo_path)
def get_remote_url(self) -> Optional[str]:
"""Extract the remote URL from .git/config"""
try:
git_config_path = self.repo_path / '.git' / 'config'
if not git_config_path.exists():
return None
config = configparser.ConfigParser()
config.read(git_config_path)
for section in config.sections():
if section.startswith('remote "origin"'):
url = config[section].get('url', '')
if url.startswith('git@github.com:'):
url = f"https://github.com/{url.split('git@github.com:')[1]}"
if url.endswith('.git'):
url = url[:-4]
return url
return None
except Exception:
return None
class FuzzyHashAnalyzer:
def __init__(self, config, logger=None):
self.config = config
self.logger = logger
self.block_size = 4096
# Get the base path and create full db path
fuzzy_base = config['analysis']['doppelganger']['db']['path']
fuzzy_dir = config['analysis']['doppelganger']['db']['fuzzyhash']
self.db_path = os.path.join(fuzzy_base, fuzzy_dir, 'FuzzyHash.db')
self.extensions = [f".{ext}" for ext in config['analysis']['doppelganger']['db'].get('fuzzy_extensions', [])]
self.db = self._load_db()
def _save_db(self):
"""Save with maximum compression"""
data = {
"sources": {
source: {
"f": { # Shortened key names
path: file_data.to_dict()
for path, file_data in source_data["files"].items()
},
"u": source_data["last_updated"]
}
for source, source_data in self.db["sources"].items()
}
}
# Convert to JSON with minimal formatting and compress
json_str = json.dumps(data, separators=(',', ':'))
compressed = zlib.compress(json_str.encode('utf-8'), level=9)
with open(self.db_path, 'wb') as f:
f.write(compressed)
def _load_db(self) -> Dict:
"""Load compressed database"""
empty_db = {"sources": {}}
if not os.path.exists(self.db_path):
if self.logger:
self.logger.debug(f"Database file {self.db_path} not found. Creating new database.")
self._save_empty_db()
return empty_db
try:
with open(self.db_path, 'rb') as f:
compressed_data = f.read()
if not compressed_data:
return empty_db
# Decompress and parse
json_str = zlib.decompress(compressed_data).decode('utf-8')
data = json.loads(json_str)
# Convert shortened keys back
converted = {"sources": {}}
for source, sdata in data["sources"].items():
converted["sources"][source] = {
"files": {},
"last_updated": sdata["u"]
}
for path, fdata in sdata["f"].items():
converted["sources"][source]["files"][path] = FileMetadata.from_dict(
fdata, self.block_size
)
return converted
except Exception as e:
if self.logger:
self.logger.error(f"Error loading database: {e}")
self._save_empty_db()
return empty_db
def _save_empty_db(self):
"""Initialize empty compressed database"""
empty_data = zlib.compress(json.dumps({"sources": {}}).encode('utf-8'), level=9)
with open(self.db_path, 'wb') as f:
f.write(empty_data)
def _compute_md5(self, file_path: str) -> str:
"""Compute MD5 hash of a file"""
md5_hash = hashlib.md5()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
md5_hash.update(chunk)
return md5_hash.hexdigest()
def _create_blocks(self, file_path: str) -> List[BlockMetadata]:
"""Create block metadata with content"""
blocks = []
with open(file_path, 'rb') as f:
index = 0
while True:
data = f.read(self.block_size)
if not data:
break
try:
hash_value = pyssdeep.fuzzy_hash_buf(data, len(data))
block_data = BlockData(data, index * self.block_size)
blocks.append(BlockMetadata(index, self.block_size, hash_value, block_data))
index += 1
except Exception as e:
if self.logger:
self.logger.warning(f"Could not compute fuzzy hash for block {index}: {str(e)}")
return blocks
def compute_file_metadata(self, file_path: str) -> FileMetadata:
"""Compute complete file metadata including blocks"""
md5 = self._compute_md5(file_path)
file_size = os.path.getsize(file_path)
blocks = self._create_blocks(file_path)
return FileMetadata(file_path, md5, file_size, blocks)
def _compare_blocks(self, blocks1: List[BlockMetadata], blocks2: List[BlockMetadata]) -> Dict:
"""Compare blocks and find matching regions"""
matching_regions = []
current_region = None
overall_similarity = 0
total_comparisons = min(len(blocks1), len(blocks2))
for b1 in blocks1:
best_match = {
"block": None,
"similarity": 0
}
for b2 in blocks2:
similarity = pyssdeep.fuzzy_compare(b1.hash, b2.hash)
if similarity > best_match["similarity"]:
best_match["similarity"] = similarity
best_match["block"] = b2
if best_match["similarity"] > 0:
if current_region is None:
current_region = MatchingRegion()
b2 = best_match["block"]
if (current_region.blocks > 0 and
b1.start_offset == current_region.source_start + current_region.length and
b2.start_offset == current_region.target_start + current_region.length):
# Extend current region
current_region.add_block(b1, b2, best_match["similarity"])
else:
# Start new region
if current_region.blocks > 0:
matching_regions.append(current_region)
current_region = MatchingRegion()
current_region.add_block(b1, b2, best_match["similarity"])
overall_similarity += best_match["similarity"]
if current_region and current_region.blocks > 0:
matching_regions.append(current_region)
return {
"overall_similarity": (overall_similarity / total_comparisons) if total_comparisons > 0 else 0,
"matching_regions": [region.to_dict() for region in matching_regions],
"total_regions": len(matching_regions)
}
def find_git_root(self, path: Path) -> Optional[Tuple[str, Path]]:
"""Find Git repository information"""
current = path
while current != current.parent:
if (current / '.git').is_dir():
repo_info = GitRepoInfo(current)
remote_url = repo_info.get_remote_url()
if remote_url:
# Convert git URLs to HTTPS format if needed
if remote_url.startswith('git@github.com:'):
remote_url = f"https://github.com/{remote_url.split('git@github.com:')[1]}"
if remote_url.endswith('.git'):
remote_url = remote_url[:-4]
return remote_url, current
current = current.parent
return "Private Collection", path.parent
def create_db_from_folder(self, folder_path: str, extensions: List[str] = None) -> Dict:
"""Create database from folder with specific file extensions"""
try:
folder = Path(folder_path)
if not folder.exists():
raise Exception(f"Folder not found: {folder_path}")
# Use provided extensions or fall back to configured extensions
extensions_to_use = extensions if extensions is not None else self.extensions
processed = 0
skipped = 0
sources_found = set()
for file_path in folder.rglob('*'):
if file_path.is_file():
if extensions_to_use and file_path.suffix.lower() not in extensions_to_use:
skipped += 1
continue
if '.git' in file_path.parts:
skipped += 1
continue
try:
repo_info = self.find_git_root(file_path)
if not repo_info:
print(f"Warning: No git repository found for {file_path}")
skipped += 1
continue
source_url, repo_root = repo_info
source = source_url
if source not in self.db["sources"]:
self.db["sources"][source] = {
"files": {},
"last_updated": datetime.now().isoformat()
}
file_metadata = self.compute_file_metadata(str(file_path))
rel_path = str(file_path.relative_to(folder))
self.db["sources"][source]["files"][rel_path] = file_metadata
processed += 1
sources_found.add(source)
if self.logger:
self.logger.debug(f"Processed: {rel_path}")
except Exception as e:
if self.logger:
self.logger.error(f"Error processing {file_path}: {str(e)}")
skipped += 1
self._save_db()
return {
"processed": processed,
"skipped": skipped,
"total": processed + skipped,
"sources": list(sources_found)
}
except Exception as e:
if self.logger:
self.logger.error(f"Database creation failed: {str(e)}")
raise Exception(f"Database creation failed: {str(e)}")
def analyze_files(self, file_paths: List[str], threshold: int = 1) -> List[Dict]:
"""Analyze files against the database - returns only top 3 matches per file"""
results = []
batch_size = 10
for i in range(0, len(file_paths), batch_size):
batch = file_paths[i:i + batch_size]
batch_metadata = []
for file_path in batch:
try:
metadata = self.compute_file_metadata(file_path)
batch_metadata.append((file_path, metadata))
except Exception as e:
if self.logger:
self.logger.error(f"Error processing {file_path}: {str(e)}")
continue
for file_path, current_metadata in batch_metadata:
matches = []
for source_url, source_data in self.db["sources"].items():
for rel_path, file_data in source_data["files"].items():
comparison = self._compare_blocks(current_metadata.blocks, file_data.blocks)
if comparison["overall_similarity"] >= threshold:
match = {
"source": source_url,
"file": rel_path,
"overall_similarity": comparison["overall_similarity"],
"md5": file_data.md5,
"matching_regions": comparison["matching_regions"],
"total_regions": comparison["total_regions"],
"target_size": file_data.file_size,
"date_added": file_data.date_added
}
matches.append(match)
# Sort matches by similarity and take top 3
sorted_matches = sorted(matches, key=lambda x: x["overall_similarity"], reverse=True)[:3]
result = {
"file": os.path.basename(file_path),
"path": file_path,
"md5": current_metadata.md5,
"file_size": current_metadata.file_size,
"total_blocks": len(current_metadata.blocks),
"matches": sorted_matches, # Now contains only top 3
"total_matches": len(matches) # Keep total count of all matches
}
results.append(result)
return results
def get_db_stats(self) -> Dict[str, Any]:
"""Get comprehensive database statistics"""
stats = {
"total_files": 0,
"total_size": 0, # Total size of indexed files
"db_size": 0, # Size of the database file itself
"sources": {},
"last_updated": None
}
# Get database file size
try:
stats["db_size"] = os.path.getsize(self.db_path)
stats["db_size_human"] = self._format_size(stats["db_size"])
except (OSError, IOError) as e:
if self.logger:
self.logger.error(f"Error getting database file size: {e}")
stats["db_size"] = 0
stats["db_size_human"] = "0 B"
# Count files and sources
for source, source_data in self.db["sources"].items():
source_stats = {
"file_count": len(source_data["files"]),
"last_updated": source_data["last_updated"]
}
stats["total_files"] += source_stats["file_count"]
for file_data in source_data["files"].values():
stats["total_size"] += file_data.file_size
stats["sources"][source] = source_stats
# Track most recent update
source_date = datetime.fromisoformat(source_data["last_updated"])
if not stats["last_updated"] or source_date > datetime.fromisoformat(stats["last_updated"]):
stats["last_updated"] = source_data["last_updated"]
stats["total_size_human"] = self._format_size(stats["total_size"])
return stats
def _format_size(self, size: int) -> str:
"""Convert bytes to human readable format"""
for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
if size < 1024 or unit == 'TB':
return f"{size:.2f} {unit}"
size /= 1024
def print_matching_region(self, region: Dict[str, Any], source_file: str, target_file: str) -> Dict[str, Any]:
"""Generate detailed hex and ASCII comparison of matching regions"""
comparison_data = {
"similarity": round(region['avg_similarity'], 2),
"length": region['length'],
"source_file": source_file,
"source_range": {
"start": f"{region['source_start']:08x}",
"end": f"{region['source_start'] + region['length']:08x}"
},
"target_file": target_file,
"target_range": {
"start": f"{region['target_start']:08x}",
"end": f"{region['target_start'] + region['length']:08x}"
},
"source_data": [],
"target_data": []
}
# Process source data
for block_data in region['source_data']:
ascii_lines = block_data['ascii_repr'].split('\n')
hex_lines = block_data['hex_dump'].split('\n')
for ascii_line, hex_line in zip(ascii_lines, hex_lines):
if set(ascii_line) != {'.'}: # Only include lines with non-dot characters
comparison_data["source_data"].append({
"ascii": ascii_line,
"hex": hex_line
})
# Process target data
for block_data in region['target_data']:
ascii_lines = block_data['ascii_repr'].split('\n')
hex_lines = block_data['hex_dump'].split('\n')
for ascii_line, hex_line in zip(ascii_lines, hex_lines):
if set(ascii_line) != {'.'}: # Only include lines with non-dot characters
comparison_data["target_data"].append({
"ascii": ascii_line,
"hex": hex_line
})
if self.logger:
self.logger.debug(f"Processed matching region comparison for {source_file} and {target_file}")
return comparison_data