diskm8/fuzzyblocks.go

411 lines
8.1 KiB
Go
Raw Normal View History

2018-01-20 00:05:04 +00:00
package main
import (
"fmt"
"os"
"os/signal"
"sync"
)
const EMPTYSECTOR = "5341e6b2646979a70e57653007a1f310169421ec9bdd9f1a5648f75ade005af1"
func GetAllDiskSectors(pattern string, pathfilter []string) map[string]DiskSectors {
cache := make(map[string]DiskSectors)
exists, matches := existsPattern(*baseName, pathfilter, pattern)
if !exists {
return cache
}
workchan := make(chan string, 100)
var s sync.Mutex
var wg sync.WaitGroup
for i := 0; i < ingestWorkers; i++ {
wg.Add(1)
go func() {
for m := range workchan {
item := &Disk{}
if err := item.ReadFromFile(m); err == nil {
//
chunk := append(item.ActiveSectors, item.InactiveSectors...)
tmp := make(DiskSectors, 0)
for _, v := range chunk {
if v.SHA256 != EMPTYSECTOR {
tmp = append(tmp, v)
} else {
fmt.Printf("%s: throw away zero sector T%d,S%d\n", item.Filename, v.Track, v.Sector)
}
}
// Load cache
s.Lock()
cache[item.FullPath] = tmp
s.Unlock()
}
}
wg.Done()
}()
}
var lastPc int = -1
for i, m := range matches {
workchan <- m
pc := int(100 * float64(i) / float64(len(matches)))
if pc != lastPc {
fmt.Print("\r")
os.Stderr.WriteString(fmt.Sprintf("Caching disk sector data... %d%% ", pc))
}
lastPc = pc
}
close(workchan)
wg.Wait()
return cache
}
func GetActiveDiskSectors(pattern string, pathfilter []string) map[string]DiskSectors {
cache := make(map[string]DiskSectors)
exists, matches := existsPattern(*baseName, pathfilter, pattern)
if !exists {
return cache
}
workchan := make(chan string, 100)
var s sync.Mutex
var wg sync.WaitGroup
for i := 0; i < ingestWorkers; i++ {
wg.Add(1)
go func() {
for m := range workchan {
item := &Disk{}
if err := item.ReadFromFile(m); err == nil {
// Load cache
s.Lock()
cache[item.FullPath] = item.ActiveSectors
s.Unlock()
}
}
wg.Done()
}()
}
var lastPc int = -1
for i, m := range matches {
workchan <- m
pc := int(100 * float64(i) / float64(len(matches)))
if pc != lastPc {
fmt.Print("\r")
os.Stderr.WriteString(fmt.Sprintf("Caching disk sector data... %d%% ", pc))
}
lastPc = pc
}
close(workchan)
wg.Wait()
return cache
}
func GetSectorMap(d DiskSectors) map[string]*DiskSector {
out := make(map[string]*DiskSector)
for _, v := range d {
out[fmt.Sprintf("T%d,S%d", v.Track, v.Sector)] = v
}
return out
}
type SectorOverlapRecord struct {
same map[string]map[*DiskSector]*DiskSector
percent map[string]float64
missing map[string][]*DiskSector
extras map[string][]*DiskSector
}
func (f *SectorOverlapRecord) Remove(key string) {
delete(f.same, key)
delete(f.percent, key)
delete(f.missing, key)
delete(f.extras, key)
}
func (f *SectorOverlapRecord) IsSubsetOf(filename string) bool {
// f is a subset if:
// missing == 0
// extra > 0
if _, ok := f.same[filename]; !ok {
return false
}
return len(f.extras[filename]) > 0 && len(f.missing[filename]) == 0
}
func (f *SectorOverlapRecord) IsSupersetOf(filename string) bool {
// f is a superset if:
// missing > 0
// extra == 0
if _, ok := f.same[filename]; !ok {
return false
}
return len(f.extras[filename]) == 0 && len(f.missing[filename]) > 0
}
func CompareSectors(d, b DiskSectors, r *SectorOverlapRecord, key string) float64 {
var sameSectors float64
var missingSectors float64
var extraSectors float64
var dmap = GetSectorMap(d)
var bmap = GetSectorMap(b)
for fileCk, info := range dmap {
binfo, bEx := bmap[fileCk]
if bEx && info.SHA256 == binfo.SHA256 {
sameSectors += 1
if r.same[key] == nil {
r.same[key] = make(map[*DiskSector]*DiskSector)
}
r.same[key][binfo] = info
} else {
missingSectors += 1
if r.missing[key] == nil {
r.missing[key] = make([]*DiskSector, 0)
}
r.missing[key] = append(r.missing[key], info)
}
}
for fileCk, info := range bmap {
_, dEx := dmap[fileCk]
if !dEx {
extraSectors += 1
// file match
if r.extras[key] == nil {
r.extras[key] = make([]*DiskSector, 0)
}
//fmt.Printf("*** %s: %s -> %s\n", b.Filename, binfo.Filename, info.Filename)
r.extras[key] = append(r.extras[key], info)
}
}
if (sameSectors + extraSectors + missingSectors) == 0 {
return 0
}
// return sameSectors / dTotal, sameSectors / bTotal, diffSectors / dTotal, diffSectors / btotal
return sameSectors / (sameSectors + extraSectors + missingSectors)
}
// Actual fuzzy file match report
func CollectSectorOverlapsAboveThreshold(t float64, pathfilter []string, ff func(pattern string, pathfilter []string) map[string]DiskSectors) map[string]*SectorOverlapRecord {
filerecords := ff("*_*_*_*.fgp", pathfilter)
results := make(map[string]*SectorOverlapRecord)
workchan := make(chan string, 100)
var wg sync.WaitGroup
var s sync.Mutex
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt)
for i := 0; i < processWorkers; i++ {
wg.Add(1)
go func() {
for m := range workchan {
v := &SectorOverlapRecord{
same: make(map[string]map[*DiskSector]*DiskSector),
percent: make(map[string]float64),
missing: make(map[string][]*DiskSector),
extras: make(map[string][]*DiskSector),
}
d := filerecords[m]
for k, b := range filerecords {
if k == m {
continue // dont compare ourselves
}
// ok good to compare -- only keep if we need our threshold
if closeness := CompareSectors(d, b, v, k); closeness < t {
v.Remove(k)
} else {
v.percent[k] = closeness
}
}
// since we delete < threshold, only add if we have any result
if len(v.percent) > 0 {
//os.Stderr.WriteString("\r\nAdded file: " + m + "\r\n\r\n")
s.Lock()
results[m] = v
s.Unlock()
}
}
wg.Done()
}()
}
// feed data in
var lastPc int = -1
var i int
for k, _ := range filerecords {
if len(c) > 0 {
sig := <-c
if sig == os.Interrupt {
close(c)
os.Stderr.WriteString("\r\nInterrupted. Waiting for workers to stop.\r\n\r\n")
break
}
}
workchan <- k
pc := int(100 * float64(i) / float64(len(filerecords)))
if pc != lastPc {
fmt.Print("\r")
os.Stderr.WriteString(fmt.Sprintf("Processing sectors data... %d%% ", pc))
}
lastPc = pc
i++
}
close(workchan)
wg.Wait()
return results
}
// Actual fuzzy file match report
func CollectSectorSubsets(pathfilter []string, ff func(pattern string, pathfilter []string) map[string]DiskSectors) map[string]*SectorOverlapRecord {
filerecords := ff("*_*_*_*.fgp", pathfilter)
results := make(map[string]*SectorOverlapRecord)
workchan := make(chan string, 100)
var wg sync.WaitGroup
var s sync.Mutex
c := make(chan os.Signal, 1)
signal.Notify(c, os.Interrupt)
for i := 0; i < processWorkers; i++ {
wg.Add(1)
go func() {
for m := range workchan {
v := &SectorOverlapRecord{
same: make(map[string]map[*DiskSector]*DiskSector),
percent: make(map[string]float64),
missing: make(map[string][]*DiskSector),
extras: make(map[string][]*DiskSector),
}
d := filerecords[m]
for k, b := range filerecords {
if k == m {
continue // dont compare ourselves
}
// ok good to compare -- only keep if we need our threshold
closeness := CompareSectors(d, b, v, k)
if !v.IsSubsetOf(k) {
v.Remove(k)
} else {
v.percent[k] = closeness
}
}
// since we delete < threshold, only add if we have any result
if len(v.percent) > 0 {
//os.Stderr.WriteString("\r\nAdded file: " + m + "\r\n\r\n")
s.Lock()
results[m] = v
s.Unlock()
}
}
wg.Done()
}()
}
// feed data in
var lastPc int = -1
var i int
for k, _ := range filerecords {
if len(c) > 0 {
sig := <-c
if sig == os.Interrupt {
close(c)
os.Stderr.WriteString("\r\nInterrupted. Waiting for workers to stop.\r\n\r\n")
break
}
}
workchan <- k
pc := int(100 * float64(i) / float64(len(filerecords)))
if pc != lastPc {
fmt.Print("\r")
os.Stderr.WriteString(fmt.Sprintf("Processing sectors data... %d%% ", pc))
}
lastPc = pc
i++
}
close(workchan)
wg.Wait()
return results
}