// Source: diskm8/data.go (943 lines, 20 KiB, Go)

package main
import (
"crypto/md5"
"encoding/gob"
"encoding/hex"
"errors"
"fmt"
"os"
"path/filepath"
"runtime"
"strings"
"time"
"github.com/paleotronic/diskm8/disk"
"github.com/paleotronic/diskm8/loggy"
)
// Disk is the metadata record kept for one disk image: identifying hashes,
// geometry, the file catalog and per-sector records, plus transient match
// results that the comparison helpers in this file write back onto it.
type Disk struct {
FullPath string // path of the image; GetFilename uses its directory part
Filename string // image name; GetFilename embeds an MD5 of it in the fingerprint name
SHA256 string // Sha of whole disk
SHA256Active string // Sha of active sectors/blocks only
Format string
FormatID disk.DiskFormat
Bitmap []bool // usage flags; LogBitmap reads it as track*Sectors+sector
Tracks, Sectors, Blocks int
Files DiskCatalog
ActiveSectors DiskSectors
//ActiveBlocks DiskBlocks
InactiveSectors DiskSectors
//InactiveBlocks DiskBlocks
MatchFactor float64 // score of the most recent comparison, set by the *WithThreshold matchers
MatchFiles map[*DiskFile]*DiskFile // filled by CompareFiles and GetFileMatches
MissingFiles, ExtraFiles []*DiskFile // filled by CompareFiles
IngestMode int
source string // file this record was last decoded from; set by ReadFromFile
}
// ByMatchFactor orders disks by ascending MatchFactor. Its Len/Swap/Less
// methods match the method set of the standard library's sort.Interface.
type ByMatchFactor []*Disk

// Len reports how many disks are in the slice.
func (s ByMatchFactor) Len() int { return len(s) }

// Swap exchanges the disks at positions i and j.
func (s ByMatchFactor) Swap(i, j int) {
	tmp := s[i]
	s[i] = s[j]
	s[j] = tmp
}

// Less reports whether disk i has a strictly smaller MatchFactor than disk j.
func (s ByMatchFactor) Less(i, j int) bool {
	left, right := s[i], s[j]
	return left.MatchFactor < right.MatchFactor
}
// TypeCode packs a file's type into one integer: the high byte selects the
// operating-system family (one of the TypeMask_* values) and the low byte
// carries that family's own file-type code.
type TypeCode int

// Family selectors stored in bits 8-15 of a TypeCode.
const (
	TypeMask_AppleDOS TypeCode = iota << 8 // 0x0000
	TypeMask_ProDOS                        // 0x0100
	TypeMask_Pascal                        // 0x0200
	TypeMask_RDOS                          // 0x0300
)
// DiskFile describes one catalog entry of a disk image.
type DiskFile struct {
Filename string
Type string
Ext string
TypeCode TypeCode // OS family in the high byte (TypeMask_*), family-specific type in the low byte
SHA256 string // content hash; the key used when files are compared across disks
Size int // printed as a byte count by GetDirectory
LoadAddress int
Text []byte
Data []byte
Locked bool
Created time.Time
Modified time.Time
}
// GetNameAdorned returns the file name decorated with its load address and
// type extension, formatted as "<name>#0x<addr>.<ext>" with the address in
// at least four lower-case hex digits. The extension is looked up from the
// low byte of TypeCode using the OS family held in the high byte; an
// unrecognised family yields an empty extension.
func (d *DiskFile) GetNameAdorned() string {
	family := d.TypeCode & 0xff00
	kind := d.TypeCode & 0xff

	suffix := ""
	switch family {
	case TypeMask_Pascal:
		suffix = disk.PascalFileType(kind).Ext()
	case TypeMask_RDOS:
		suffix = disk.RDOSFileType(kind).Ext()
	case TypeMask_ProDOS:
		suffix = disk.ProDOSFileType(kind).Ext()
	case TypeMask_AppleDOS:
		suffix = disk.FileType(kind).Ext()
	}

	return fmt.Sprintf("%s#0x%.4x.%s", d.Filename, d.LoadAddress, suffix)
}
// GetName returns the file name followed by a dot and its type extension.
// The extension is looked up from the low byte of TypeCode using the OS
// family held in the high byte; an unrecognised family yields an empty
// extension (the trailing dot is still emitted).
func (d *DiskFile) GetName() string {
	family := d.TypeCode & 0xff00
	kind := d.TypeCode & 0xff

	suffix := ""
	switch family {
	case TypeMask_Pascal:
		suffix = disk.PascalFileType(kind).Ext()
	case TypeMask_RDOS:
		suffix = disk.RDOSFileType(kind).Ext()
	case TypeMask_ProDOS:
		suffix = disk.ProDOSFileType(kind).Ext()
	case TypeMask_AppleDOS:
		suffix = disk.FileType(kind).Ext()
	}

	return fmt.Sprintf("%s.%s", d.Filename, suffix)
}
// DiskCatalog is the list of files found on a disk.
type DiskCatalog []*DiskFile
// DiskSectors is a list of per-sector records.
type DiskSectors []*DiskSector
// DiskBlocks is a list of per-block records.
type DiskBlocks []*DiskBlock
// DiskSector records one sector of a disk, addressed by track and sector
// number, together with a hash of its contents and the raw bytes.
type DiskSector struct {
Track int
Sector int
SHA256 string // value compared by the positional chunk comparison
Data []byte
}
// DiskBlock records one block of a disk by block number and content hash.
type DiskBlock struct {
Block int
SHA256 string
}
// LogBitmap writes the disk's usage bitmap to the logger selected by id, one
// log line per row. Set entries are printed as their two-digit hex column
// number and clear entries as "::".
//
// Track-based disks (Tracks > 0) get one row per track with Sectors columns.
// Otherwise, block-based disks (Blocks > 0) get rows of 16 blocks each; any
// remainder of Blocks/16 is not printed. Nothing is logged when both counts
// are zero.
func (i Disk) LogBitmap(id int) {
	l := loggy.Get(id)

	// logRows prints rows*cols consecutive bitmap entries. The row stride is
	// always cols: the block layout previously multiplied by i.Sectors while
	// stepping 16 columns, so the rows did not line up with the bitmap.
	logRows := func(format string, rows, cols int) {
		for r := 0; r < rows; r++ {
			line := fmt.Sprintf(format, r)
			for c := 0; c < cols; c++ {
				if i.Bitmap[r*cols+c] {
					line += fmt.Sprintf("%.2x ", c)
				} else {
					line += ":: "
				}
			}
			l.Logf("%s", line)
		}
	}

	switch {
	case i.Tracks > 0:
		logRows("Track %.2d: ", i.Tracks, i.Sectors)
	case i.Blocks > 0:
		const blocksPerRow = 16
		logRows("Block %.2d: ", i.Blocks/blocksPerRow, blocksPerRow)
	}
}
// GetFilename builds the relative path of this disk's fingerprint file:
//
//	<dir of FullPath, outer slashes trimmed>/<format id>_<SHA256>_<SHA256Active>_<md5 of Filename>.fgp
//
// On Windows, colons are removed and backslashes become forward slashes.
func (d Disk) GetFilename() string {
	nameSum := md5.Sum([]byte(d.Filename))
	dir := strings.Trim(filepath.Dir(d.FullPath), "/")

	parts := []string{
		fmt.Sprintf("%s/%d", dir, d.FormatID.ID),
		d.SHA256,
		d.SHA256Active,
		hex.EncodeToString(nameSum[:]),
	}
	path := strings.Join(parts, "_") + ".fgp"

	if runtime.GOOS != "windows" {
		return path
	}
	withoutDrive := strings.Replace(path, ":", "", -1)
	return strings.Replace(withoutDrive, "\\", "/", -1)
}
// WriteToFile gob-encodes the disk record into filename, creating the parent
// directory first when needed.
//
// It returns the first error hit while creating the directory, creating the
// file, encoding the record or closing the file. The "Created" log line is
// only written once the file has been written and closed successfully.
func (d Disk) WriteToFile(filename string) error {
	if err := os.MkdirAll(filepath.Dir(filename), 0755); err != nil {
		return err
	}

	f, err := os.Create(filename)
	if err != nil {
		return err
	}

	if err := gob.NewEncoder(f).Encode(d); err != nil {
		// The encode error is the one worth reporting; close is best-effort here.
		f.Close()
		return err
	}
	// Close explicitly so that a failed flush is reported and not lost in a defer.
	if err := f.Close(); err != nil {
		return err
	}

	loggy.Get(0).Logf("Created %s", filename)
	return nil
}
// ReadFromFile gob-decodes the record stored in filename into d.
//
// An error opening the file is returned immediately and leaves d untouched.
// Once the file is open, d.source is set to filename whether or not decoding
// succeeds, and the decode error (if any) is returned.
func (d *Disk) ReadFromFile(filename string) error {
	in, openErr := os.Open(filename)
	if openErr != nil {
		return openErr
	}
	defer in.Close()

	decodeErr := gob.NewDecoder(in).Decode(d)
	d.source = filename
	return decodeErr
}
// GetExactBinaryMatches returns the cached disks, other than d itself, whose
// fingerprint file name carries the same format ID and the same whole-disk
// SHA256 as d. filter is passed through to existsPattern. The result is
// never nil; it is empty when nothing matches.
func (d *Disk) GetExactBinaryMatches(filter []string) []*Disk {
	l := loggy.Get(0)
	out := make([]*Disk, 0)

	// GetFilename names fingerprints "<FormatID.ID>_<sha>_<active sha>_<md5>.fgp",
	// so the pattern must use the numeric ID. Formatting the whole FormatID
	// struct with %d yields a brace-wrapped value that cannot match those names.
	pattern := fmt.Sprintf("%d", d.FormatID.ID) + "_" + d.SHA256 + "_*_*.fgp"

	exists, matches := existsPattern(*baseName, filter, pattern)
	if !exists {
		return out
	}

	for _, m := range matches {
		l.Logf(":: Checking %s", m)
		item, err := cache.Get(m)
		if err != nil {
			// Unreadable fingerprints are skipped, not treated as fatal.
			continue
		}
		if item.FullPath != d.FullPath {
			out = append(out, item)
		}
	}
	return out
}
// GetActiveSectorBinaryMatches returns the cached disks, other than d itself,
// whose fingerprint file name carries the same format ID and the same
// active-sector SHA256 as d. filter is passed through to existsPattern. The
// result is never nil; it is empty when nothing matches.
func (d *Disk) GetActiveSectorBinaryMatches(filter []string) []*Disk {
	l := loggy.Get(0)
	out := make([]*Disk, 0)

	// GetFilename names fingerprints "<FormatID.ID>_<sha>_<active sha>_<md5>.fgp",
	// so the pattern must use the numeric ID. Formatting the whole FormatID
	// struct with %d yields a brace-wrapped value that cannot match those names.
	pattern := fmt.Sprintf("%d", d.FormatID.ID) + "_*_" + d.SHA256Active + "_*.fgp"

	exists, matches := existsPattern(*baseName, filter, pattern)
	if !exists {
		return out
	}

	for _, m := range matches {
		l.Logf(":: Checking %s", m)
		item, err := cache.Get(m)
		if err != nil {
			// Unreadable fingerprints are skipped, not treated as fatal.
			continue
		}
		if item.FullPath != d.FullPath {
			out = append(out, item)
		}
	}
	return out
}
// GetFileMap indexes the disk's files by their SHA256. When several files
// share a hash, the one that appears last in d.Files wins.
func (d *Disk) GetFileMap() map[string]*DiskFile {
	byHash := make(map[string]*DiskFile, len(d.Files))
	for idx := range d.Files {
		entry := d.Files[idx]
		byHash[entry.SHA256] = entry
	}
	return byHash
}
// GetUtilizationMap maps each active sector to its SHA256, keyed by the
// string "T<track>S<sector>". A disk without active sectors yields an empty,
// non-nil map.
func (d *Disk) GetUtilizationMap() map[string]string {
	hashes := make(map[string]string)
	for _, sec := range d.ActiveSectors {
		hashes[fmt.Sprintf("T%dS%d", sec.Track, sec.Sector)] = sec.SHA256
	}
	return hashes
}
// CompareChunks compares the allocated chunks of d and b position by
// position and returns four ratios, in this order: identical chunks over d's
// total, identical chunks over b's total, differing chunks over d's total,
// and differing chunks over b's total.
//
// Sector-oriented formats are handed to compareSectorsPositional and
// block-oriented formats to compareBlocksPositional. Four zeros are returned
// when the two disks have different formats (which is also logged) or when
// the format is not one of the handled IDs.
func (d *Disk) CompareChunks(b *Disk) (float64, float64, float64, float64) {
	if d.FormatID != b.FormatID {
		loggy.Get(0).Logf("Trying to compare disks of different types")
		return 0, 0, 0, 0
	}

	switch d.FormatID.ID {
	case disk.DF_RDOS_3, disk.DF_RDOS_32, disk.DF_RDOS_33,
		disk.DF_DOS_SECTORS_13, disk.DF_DOS_SECTORS_16:
		return d.compareSectorsPositional(b)
	case disk.DF_PASCAL, disk.DF_PRODOS, disk.DF_PRODOS_800KB:
		return d.compareBlocksPositional(b)
	default:
		return 0, 0, 0, 0
	}
}
// compareSectorsPositional walks every track/sector position of d's format
// and compares the active-sector hashes of d and b at that position.
//
// It returns, in order: identical sectors over d's active total, identical
// sectors over b's active total, differing sectors over d's active total,
// and differing sectors over b's active total. A ratio whose total is zero
// is reported as 0, so the results never contain the NaN that a bare 0/0
// division would produce.
func (d *Disk) compareSectorsPositional(b *Disk) (float64, float64, float64, float64) {
	l := loggy.Get(0)

	var sameSectors, diffSectors float64
	var dNotb, bNotd float64
	var dTotal, bTotal float64

	dmap := d.GetUtilizationMap()
	bmap := b.GetUtilizationMap()

	tracks, sectors := d.FormatID.TPD(), d.FormatID.SPT()
	for t := 0; t < tracks; t++ {
		for s := 0; s < sectors; s++ {
			key := fmt.Sprintf("T%dS%d", t, s)
			dCk, dEx := dmap[key]
			bCk, bEx := bmap[key]
			switch {
			case dEx && bEx:
				if dCk == bCk {
					sameSectors++
				} else {
					diffSectors++
				}
				dTotal++
				bTotal++
			case dEx:
				dNotb++
				dTotal++
			case bEx:
				bNotd++
				bTotal++
			}
		}
	}

	l.Debugf("Same Sectors : %f", sameSectors)
	l.Debugf("Differing Sectors: %f", diffSectors)
	l.Debugf("Not in other disk: %f", dNotb)
	l.Debugf("Not in this disk : %f", bNotd)

	// ratio guards against a disk with no active sectors at all.
	ratio := func(count, total float64) float64 {
		if total == 0 {
			return 0
		}
		return count / total
	}
	return ratio(sameSectors, dTotal), ratio(sameSectors, bTotal),
		ratio(diffSectors, dTotal), ratio(diffSectors, bTotal)
}
// compareBlocksPositional walks every block position of d's format and
// compares the hashes recorded for d and b at that position.
//
// It returns, in order: identical blocks over d's total, identical blocks
// over b's total, differing blocks over d's total, and differing blocks over
// b's total. A ratio whose total is zero is reported as 0, so the results
// never contain the NaN that a bare 0/0 division would produce.
//
// NOTE(review): lookups use "B<n>" keys, but GetUtilizationMap in this file
// only emits "T<track>S<sector>" keys, so these lookups appear never to hit
// and every ratio comes out 0 — confirm how block-format disks are expected
// to populate the utilization map.
func (d *Disk) compareBlocksPositional(b *Disk) (float64, float64, float64, float64) {
	l := loggy.Get(0)

	var sameSectors, diffSectors float64
	var dNotb, bNotd float64
	var dTotal, bTotal float64

	dmap := d.GetUtilizationMap()
	bmap := b.GetUtilizationMap()

	blocks := d.FormatID.BPD()
	for t := 0; t < blocks; t++ {
		key := fmt.Sprintf("B%d", t)
		dCk, dEx := dmap[key]
		bCk, bEx := bmap[key]
		switch {
		case dEx && bEx:
			if dCk == bCk {
				sameSectors++
			} else {
				diffSectors++
			}
			dTotal++
			bTotal++
		case dEx:
			dNotb++
			dTotal++
		case bEx:
			bNotd++
			bTotal++
		}
	}

	l.Debugf("Same Blocks : %f", sameSectors)
	l.Debugf("Differing Blocks : %f", diffSectors)
	l.Debugf("Not in other disk: %f", dNotb)
	l.Debugf("Not in this disk : %f", bNotd)

	// ratio guards against a disk with no recorded blocks at all.
	ratio := func(count, total float64) float64 {
		if total == 0 {
			return 0
		}
		return count / total
	}
	return ratio(sameSectors, dTotal), ratio(sameSectors, bTotal),
		ratio(diffSectors, dTotal), ratio(diffSectors, bTotal)
}
// GetPartialMatches compares d chunk-by-chunk against every cached disk of
// the same format (other than d itself) and sorts the results into three
// lists, returned in this order:
//
//   - superset:  all of d's chunks match the other disk, but not vice versa
//   - subset:    all of the other disk's chunks match d, but not vice versa
//   - identical: both disks fully match each other
//
// Disks that only partly overlap are not returned. filter is passed through
// to existsPattern. The returned slices are never nil.
func (d *Disk) GetPartialMatches(filter []string) ([]*Disk, []*Disk, []*Disk) {
	l := loggy.Get(0)
	superset := make([]*Disk, 0)
	subset := make([]*Disk, 0)
	identical := make([]*Disk, 0)

	// GetFilename names fingerprints "<FormatID.ID>_<sha>_<active sha>_<md5>.fgp",
	// so the pattern must use the numeric ID. Formatting the whole FormatID
	// struct with %d yields a brace-wrapped value that cannot match those names.
	pattern := fmt.Sprintf("%d", d.FormatID.ID) + "_*_*_*.fgp"

	exists, matches := existsPattern(*baseName, filter, pattern)
	if !exists {
		return superset, subset, identical
	}

	for _, m := range matches {
		item, err := cache.Get(m)
		if err != nil || item.FullPath == d.FullPath {
			// Skip unreadable fingerprints and the disk being examined.
			continue
		}

		l.Logf(":: Checking overlapping data blocks %s", item.Filename)
		l.Log("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
		dSame, iSame, dDiff, iDiff := d.CompareChunks(item)
		l.Logf("== This disk shares %.2f percent of its allocated blocks with %s", dSame*100, item.Filename)
		l.Logf("!= This disk differs %.2f percent of its allocate blocks with %s", dDiff*100, item.Filename)
		l.Logf("== %s shares %.2f of its blocks with this disk", item.Filename, iSame*100)
		l.Logf("!= %s differs %.2f of its blocks with this disk", item.Filename, iDiff*100)

		switch {
		case dSame == 1 && iSame < 1:
			superset = append(superset, item)
		case iSame == 1 && dSame < 1:
			subset = append(subset, item)
		case iSame == 1 && dSame == 1:
			identical = append(identical, item)
		}
	}
	return superset, subset, identical
}
// GetPartialMatchesWithThreshold compares d chunk-by-chunk against every
// cached disk of the same format (other than d itself) and returns those for
// which the share of d's chunks that are identical is at least t.
//
// Each compared disk has its MatchFactor overwritten with that share, even
// when it falls below the threshold; the disks are the shared cache entries,
// so the value is visible to later callers. Progress is written to stderr.
// filter is passed through to existsPattern. The result is never nil.
func (d *Disk) GetPartialMatchesWithThreshold(t float64, filter []string) []*Disk {
	l := loggy.Get(0)
	matchlist := make([]*Disk, 0)

	// GetFilename names fingerprints "<FormatID.ID>_<sha>_<active sha>_<md5>.fgp",
	// so the pattern must use the numeric ID. Formatting the whole FormatID
	// struct with %d yields a brace-wrapped value that cannot match those names.
	pattern := fmt.Sprintf("%d", d.FormatID.ID) + "_*_*_*.fgp"

	exists, matches := existsPattern(*baseName, filter, pattern)
	if !exists {
		return matchlist
	}

	lastPc := -1
	for i, m := range matches {
		item, err := cache.Get(m)
		if err != nil {
			// Unreadable fingerprints are skipped, not treated as fatal.
			continue
		}

		pc := int(100 * float64(i) / float64(len(matches)))
		if pc != lastPc {
			os.Stderr.WriteString(fmt.Sprintf("Analyzing volumes... %d%% ", pc))
		}

		if item.FullPath != d.FullPath {
			// only here if not looking at same disk
			l.Logf(":: Checking overlapping data blocks %s", item.Filename)
			dSame, _, _, _ := d.CompareChunks(item)
			item.MatchFactor = dSame
			if dSame >= t {
				matchlist = append(matchlist, item)
			}
		}

		fmt.Print("\r")
		lastPc = pc
	}
	return matchlist
}
// Aggregate calls f once for every fingerprint selected by pathfilter that
// can be loaded through the cache, passing collector through unchanged so f
// can accumulate results in it. Fingerprints that fail to load are skipped.
//
// Progress is written to stderr each time the whole-number percentage
// changes, followed by "Done." once the loop finishes. Nothing is written
// when no fingerprint matches.
func Aggregate(f func(d *Disk, collector interface{}), collector interface{}, pathfilter []string) {
	l := loggy.Get(0)

	exists, matches := existsPattern(*baseName, pathfilter, "*_*_*_*.fgp")
	if !exists {
		return
	}

	lastPc := -1
	for i, m := range matches {
		pc := int(100 * float64(i) / float64(len(matches)))
		if pc != lastPc {
			os.Stderr.WriteString(fmt.Sprintf("\rAggregating data... %d%% ", pc))
			// Remember what was printed; lastPc used to stay at -1 forever, so
			// the line was rewritten for every single item.
			lastPc = pc
		}

		l.Logf(":: Checking %s", m)
		if item, err := cache.Get(m); err == nil {
			f(item, collector)
		}
	}
	os.Stderr.WriteString("Done.\n")
}
// CompareFiles compares the non-empty files of d and b by SHA256 and returns
// the share of files common to both: same / (same + missing + extra). Files
// with Size 0 are ignored on both sides. When neither disk has a non-empty
// file the result is 0, not the NaN a bare 0/0 division would give.
//
// The details are recorded on b:
//   - b.MatchFiles maps b's file to d's file for every shared hash
//   - b.MissingFiles gains d's files that b lacks
//   - b.ExtraFiles gains b's files that d lacks
//
// These are added to whatever b already holds; nothing is cleared first.
func (d *Disk) CompareFiles(b *Disk) float64 {
	var sameFiles, missingFiles, extraFiles float64

	dmap := d.GetFileMap()
	bmap := b.GetFileMap()

	for fileCk, info := range dmap {
		if info.Size == 0 {
			continue
		}
		if binfo, bEx := bmap[fileCk]; bEx {
			// Present on both disks.
			sameFiles++
			if b.MatchFiles == nil {
				b.MatchFiles = make(map[*DiskFile]*DiskFile)
			}
			b.MatchFiles[binfo] = info
		} else {
			// On d but not on b.
			missingFiles++
			b.MissingFiles = append(b.MissingFiles, info)
		}
	}

	for fileCk, info := range bmap {
		if info.Size == 0 {
			continue
		}
		if _, dEx := dmap[fileCk]; !dEx {
			// On b but not on d.
			extraFiles++
			b.ExtraFiles = append(b.ExtraFiles, info)
		}
	}

	total := sameFiles + extraFiles + missingFiles
	if total == 0 {
		return 0
	}
	return sameFiles / total
}
// GetPartialFileMatchesWithThreshold compares d's files against every cached
// disk selected by filter (other than d itself) using CompareFiles and
// returns those whose score is at least t. Each compared disk has its
// MatchFactor overwritten with the score. Progress is written to stderr.
// The result is never nil.
func (d *Disk) GetPartialFileMatchesWithThreshold(t float64, filter []string) []*Disk {
	logger := loggy.Get(0)
	found := make([]*Disk, 0)

	ok, paths := existsPattern(*baseName, filter, "*_*_*_*.fgp")
	if !ok {
		return found
	}

	previous := -1
	for idx, path := range paths {
		candidate, err := cache.Get(path)
		if err != nil {
			continue
		}

		percent := int(100 * float64(idx) / float64(len(paths)))
		if percent != previous {
			os.Stderr.WriteString(fmt.Sprintf("Analyzing volumes... %d%% ", percent))
		}

		// Never compare the disk with itself.
		if candidate.FullPath != d.FullPath {
			logger.Logf(":: Checking overlapping files %s", candidate.Filename)
			score := d.CompareFiles(candidate)
			candidate.MatchFactor = score
			if score >= t {
				found = append(found, candidate)
			}
		}

		fmt.Print("\r")
		previous = percent
	}
	return found
}
// HasFileSHA256 looks for a file whose SHA256 equals sha. It returns true
// and the first such file in catalog order, or false and nil when none
// matches.
func (d *Disk) HasFileSHA256(sha string) (bool, *DiskFile) {
	for idx := range d.Files {
		if candidate := d.Files[idx]; candidate.SHA256 == sha {
			return true, candidate
		}
	}
	return false, nil
}
// GetFileChecksum looks a file up by name, comparing lower-cased names, and
// returns true with its SHA256. The first match in catalog order wins. When
// no file has that name it returns false and an empty string.
func (d *Disk) GetFileChecksum(filename string) (bool, string) {
	wanted := strings.ToLower(filename)
	for _, entry := range d.Files {
		if strings.ToLower(entry.Filename) == wanted {
			return true, entry.SHA256
		}
	}
	return false, ""
}
// GetFileMatches finds the cached disks selected by filter (other than d
// itself) that contain a file with the same SHA256 as d's file named
// filename. For each hit, the disk's MatchFiles gains an entry mapping d's
// file to the matching file on that disk.
//
// If d has no file with that name, a message is written to stderr and an
// empty list is returned. Progress is written to stderr while scanning. The
// result is never nil.
func (d *Disk) GetFileMatches(filename string, filter []string) []*Disk {
	logger := loggy.Get(0)
	found := make([]*Disk, 0)

	ok, paths := existsPattern(*baseName, filter, "*_*_*_*.fgp")
	if !ok {
		return found
	}

	known, sum := d.GetFileChecksum(filename)
	if !known {
		os.Stderr.WriteString("File does not exist on this volume: " + filename + "\n")
		return found
	}
	_, srcFile := d.HasFileSHA256(sum)

	previous := -1
	for idx, path := range paths {
		candidate, err := cache.Get(path)
		if err != nil {
			continue
		}

		percent := int(100 * float64(idx) / float64(len(paths)))
		if percent != previous {
			os.Stderr.WriteString(fmt.Sprintf("Analyzing volumes... %d%% ", percent))
		}

		// Never compare the disk with itself.
		if candidate.FullPath != d.FullPath {
			logger.Logf(":: Checking overlapping files %s", candidate.Filename)
			if has, twin := candidate.HasFileSHA256(sum); has {
				if candidate.MatchFiles == nil {
					candidate.MatchFiles = make(map[*DiskFile]*DiskFile)
				}
				candidate.MatchFiles[srcFile] = twin
				found = append(found, candidate)
			}
		}

		fmt.Print("\r")
		previous = percent
	}
	return found
}
// GetDirectory renders the disk's catalog as text, one line per file, by
// expanding these placeholders in format:
//
//	{size:blocks}  Size/256+1, as "%3d Blocks"
//	{size:kb}      Size/1024+1, as "%4d Kb"
//	{size:b}       Size, as "%6d Bytes"
//	{size}         Size, as "%6d"
//	{filename}     file name, left-aligned in 20 columns
//	{type}         file type, left-aligned in 20 columns
//	{sha256}       file SHA256
//	{loadaddr}     load address as "0x" plus at least four upper-case hex digits
//
// Every line ends with a newline. All placeholders are expanded in a single
// pass over format, so placeholder-like text inside a substituted value
// (for example in a file name) is left as is. An empty catalog yields "".
func (d *Disk) GetDirectory(format string) string {
	lines := make([]string, 0, len(d.Files))
	for _, file := range d.Files {
		expand := strings.NewReplacer(
			// size
			"{size:blocks}", fmt.Sprintf("%3d Blocks", file.Size/256+1),
			"{size:kb}", fmt.Sprintf("%4d Kb", file.Size/1024+1),
			"{size:b}", fmt.Sprintf("%6d Bytes", file.Size),
			"{size}", fmt.Sprintf("%6d", file.Size),
			// name and type
			"{filename}", fmt.Sprintf("%-20s", file.Filename),
			"{type}", fmt.Sprintf("%-20s", file.Type),
			// sha256
			"{sha256}", file.SHA256,
			// load address: zero-padded hex. The previous "0x.%4X" printed a
			// stray dot and space-padded the digits.
			"{loadaddr}", fmt.Sprintf("0x%.4X", file.LoadAddress),
		)
		lines = append(lines, expand.Replace(format)+"\n")
	}
	return strings.Join(lines, "")
}
// CacheContext labels what a DiskMetaDataCache was created for.
type CacheContext int

// Known cache contexts, numbered consecutively from zero.
const (
	CC_All = CacheContext(iota)
	CC_ActiveSectors
	CC_AllSectors
	CC_Files
)
// DiskMetaDataCache keeps decoded Disk records in memory, keyed by the
// fingerprint file name they were loaded from.
type DiskMetaDataCache struct {
ctx CacheContext // context given at construction; not read by the methods visible in this file
Disks map[string]*Disk // fingerprint file name -> decoded record
}
// cache is the package-wide metadata cache used by the matching helpers in
// this file. It starts empty and is filled as Get loads fingerprints.
var cache = NewCache(CC_All, "")
// Get returns the record for filename, loading it from disk and remembering
// it on first use. If the file cannot be read or decoded, nothing is cached
// and a generic "Not found" error is returned.
func (c *DiskMetaDataCache) Get(filename string) (*Disk, error) {
	if hit, found := c.Disks[filename]; found {
		return hit, nil
	}

	loaded := new(Disk)
	if err := loaded.ReadFromFile(filename); err != nil {
		return nil, errors.New("Not found")
	}
	c.Disks[filename] = loaded
	return loaded, nil
}
// Put stores item under filename, replacing any record already cached under
// that name.
func (c *DiskMetaDataCache) Put(filename string, item *Disk) {
c.Disks[filename] = item
}
// NewCache returns an empty cache tagged with ctx. The pattern argument is
// accepted but not used.
func NewCache(ctx CacheContext, pattern string) *DiskMetaDataCache {
	return &DiskMetaDataCache{
		ctx:   ctx,
		Disks: map[string]*Disk{},
	}
}
// CreateCache returns a cache tagged with ctx and pre-loaded with every
// fingerprint that existsPattern finds for pattern and filter. Files that
// cannot be read or decoded are skipped. Progress is written to stderr.
// When nothing matches, the returned cache is empty.
func CreateCache(ctx CacheContext, pattern string, filter []string) *DiskMetaDataCache {
	built := &DiskMetaDataCache{
		ctx:   ctx,
		Disks: make(map[string]*Disk),
	}

	ok, paths := existsPattern(*baseName, filter, pattern)
	if !ok {
		return built
	}

	previous := -1
	for idx, path := range paths {
		record := new(Disk)
		if err := record.ReadFromFile(path); err != nil {
			continue
		}

		percent := int(100 * float64(idx) / float64(len(paths)))
		if percent != previous {
			os.Stderr.WriteString(fmt.Sprintf("Caching data... %d%% ", percent))
		}

		built.Put(path, record)
		fmt.Print("\r")
		previous = percent
	}
	return built
}
// SearchPartialFileMatchesWithThreshold compares every pair of cached disks
// selected by filter using CompareFiles and returns the pairs that score at
// least t.
//
// Pairs are keyed by the two disks' SHA256 values joined with ":", smaller
// one first, and each key is evaluated only once. A disk is never compared
// with a disk that has the same FullPath. The comparison works on copies of
// the cached records, and the returned pointers refer to those copies.
// Progress is written to stderr.
func SearchPartialFileMatchesWithThreshold(t float64, filter []string) map[string][2]*Disk {
	logger := loggy.Get(0)
	pairs := make(map[string][2]*Disk)

	ok, paths := existsPattern(*baseName, filter, "*_*_*_*.fgp")
	if !ok {
		return pairs
	}

	seen := make(map[string]bool)
	previous := -1
	for idx, leftPath := range paths {
		src, err := cache.Get(leftPath)
		if err != nil {
			continue
		}
		left := *src

		percent := int(100 * float64(idx) / float64(len(paths)))
		if percent != previous {
			os.Stderr.WriteString(fmt.Sprintf("Analyzing volumes... %d%% ", percent))
		}

		for _, rightPath := range paths {
			other, err := cache.Get(rightPath)
			if err != nil {
				continue
			}
			right := *other

			// Order the two hashes so that (a,b) and (b,a) share one key.
			lo, hi := left.SHA256, right.SHA256
			if hi < lo {
				lo, hi = hi, lo
			}
			key := lo + ":" + hi
			if seen[key] {
				continue
			}

			// Never compare a disk with itself.
			if right.FullPath != left.FullPath {
				logger.Logf(":: Checking overlapping files %s", right.Filename)
				score := left.CompareFiles(&right)
				right.MatchFactor = score
				if score >= t {
					pairs[key] = [2]*Disk{&left, &right}
				}
			}
			seen[key] = true
		}

		fmt.Print("\r")
		previous = percent
	}
	return pairs
}
// Worker counts; the code that reads them is not visible in this part of the file.
const (
	ingestWorkers  = 4
	processWorkers = 6
)
// exists reports whether os.Stat succeeds for path. Any Stat error, not just
// "does not exist", counts as the path being absent.
func exists(path string) bool {
	_, statErr := os.Stat(path)
	return statErr == nil
}