mirror of
				https://github.com/paleotronic/diskm8.git
				synced 2025-10-25 10:19:24 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			468 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			468 lines
		
	
	
		
			8.9 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
| package main
 | |
| 
 | |
| import (
 | |
| 	"fmt"
 | |
| 	"os"
 | |
| 	"os/signal"
 | |
| 	"strings"
 | |
| 	"sync"
 | |
| )
 | |
| 
 | |
// inList reports whether item equals any entry of list, ignoring case.
//
// The comparison uses strings.EqualFold, so no lowered copies of the strings
// are allocated. A nil or empty list never matches.
func inList(item string, list []string) bool {
	for _, v := range list {
		if strings.EqualFold(v, item) {
			return true
		}
	}
	return false
}
 | |
| 
 | |
// Comparison filters consulted by CompareCatalogs.
const (
	// EXCLUDEZEROBYTE enables skipping of files whose Size is 0.
	EXCLUDEZEROBYTE = true

	// EXCLUDEHELLO enables skipping of files whose Filename is "hello".
	EXCLUDEHELLO = true
)
 | |
| 
 | |
| func GetAllFiles(pattern string, pathfilter []string) map[string]DiskCatalog {
 | |
| 
 | |
| 	cache := make(map[string]DiskCatalog)
 | |
| 
 | |
| 	exists, matches := existsPattern(*baseName, pathfilter, pattern)
 | |
| 	if !exists {
 | |
| 		return cache
 | |
| 	}
 | |
| 
 | |
| 	workchan := make(chan string, 100)
 | |
| 	var s sync.Mutex
 | |
| 	var wg sync.WaitGroup
 | |
| 
 | |
| 	for i := 0; i < ingestWorkers; i++ {
 | |
| 		wg.Add(1)
 | |
| 		go func() {
 | |
| 			for m := range workchan {
 | |
| 				item := &Disk{}
 | |
| 				if err := item.ReadFromFile(m); err == nil {
 | |
| 
 | |
| 					if len(item.Files) == 0 {
 | |
| 						continue
 | |
| 					}
 | |
| 
 | |
| 					// Load cache
 | |
| 					s.Lock()
 | |
| 					cache[item.FullPath] = item.Files
 | |
| 					s.Unlock()
 | |
| 
 | |
| 				} else {
 | |
| 					fmt.Println("FAIL")
 | |
| 				}
 | |
| 			}
 | |
| 			wg.Done()
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	var lastPc int = -1
 | |
| 	for i, m := range matches {
 | |
| 
 | |
| 		//fmt.Printf("Queue: %s\n", m)
 | |
| 
 | |
| 		workchan <- m
 | |
| 
 | |
| 		pc := int(100 * float64(i) / float64(len(matches)))
 | |
| 
 | |
| 		if pc != lastPc {
 | |
| 			fmt.Print("\r")
 | |
| 			os.Stderr.WriteString(fmt.Sprintf("Caching data... %d%%   ", pc))
 | |
| 		}
 | |
| 
 | |
| 		lastPc = pc
 | |
| 	}
 | |
| 	close(workchan)
 | |
| 
 | |
| 	wg.Wait()
 | |
| 
 | |
| 	return cache
 | |
| }
 | |
| 
 | |
| type FileOverlapRecord struct {
 | |
| 	files   map[string]map[*DiskFile]*DiskFile
 | |
| 	percent map[string]float64
 | |
| 	missing map[string][]*DiskFile
 | |
| 	extras  map[string][]*DiskFile
 | |
| }
 | |
| 
 | |
| func (f *FileOverlapRecord) Remove(key string) {
 | |
| 	delete(f.files, key)
 | |
| 	delete(f.percent, key)
 | |
| 	delete(f.missing, key)
 | |
| 	delete(f.extras, key)
 | |
| }
 | |
| 
 | |
| func (f *FileOverlapRecord) IsSubsetOf(filename string) bool {
 | |
| 
 | |
| 	// f is a subset if:
 | |
| 	// missing == 0
 | |
| 	// extra > 0
 | |
| 
 | |
| 	if _, ok := f.files[filename]; !ok {
 | |
| 		return false
 | |
| 	}
 | |
| 
 | |
| 	return len(f.extras[filename]) > 0 && len(f.missing[filename]) == 0
 | |
| 
 | |
| }
 | |
| 
 | |
| func (f *FileOverlapRecord) IsSupersetOf(filename string) bool {
 | |
| 
 | |
| 	// f is a superset if:
 | |
| 	// missing > 0
 | |
| 	// extra == 0
 | |
| 
 | |
| 	if _, ok := f.files[filename]; !ok {
 | |
| 		return false
 | |
| 	}
 | |
| 
 | |
| 	return len(f.extras[filename]) == 0 && len(f.missing[filename]) > 0
 | |
| 
 | |
| }
 | |
| 
 | |
| // Actual fuzzy file match report
 | |
| func CollectFilesOverlapsAboveThreshold(t float64, pathfilter []string) map[string]*FileOverlapRecord {
 | |
| 
 | |
| 	filerecords := GetAllFiles("*_*_*_*.fgp", pathfilter)
 | |
| 
 | |
| 	results := make(map[string]*FileOverlapRecord)
 | |
| 
 | |
| 	workchan := make(chan string, 100)
 | |
| 	var wg sync.WaitGroup
 | |
| 	var s sync.Mutex
 | |
| 
 | |
| 	c := make(chan os.Signal, 1)
 | |
| 	signal.Notify(c, os.Interrupt)
 | |
| 
 | |
| 	for i := 0; i < processWorkers; i++ {
 | |
| 		wg.Add(1)
 | |
| 		go func() {
 | |
| 			for m := range workchan {
 | |
| 
 | |
| 				v := &FileOverlapRecord{
 | |
| 					files:   make(map[string]map[*DiskFile]*DiskFile),
 | |
| 					percent: make(map[string]float64),
 | |
| 					missing: make(map[string][]*DiskFile),
 | |
| 					extras:  make(map[string][]*DiskFile),
 | |
| 				}
 | |
| 
 | |
| 				d := filerecords[m]
 | |
| 
 | |
| 				for k, b := range filerecords {
 | |
| 					if k == m {
 | |
| 						continue // dont compare ourselves
 | |
| 					}
 | |
| 					// ok good to compare -- only keep if we need our threshold
 | |
| 
 | |
| 					if closeness := CompareCatalogs(d, b, v, k); closeness < t {
 | |
| 						v.Remove(k)
 | |
| 					} else {
 | |
| 						v.percent[k] = closeness
 | |
| 					}
 | |
| 				}
 | |
| 
 | |
| 				// since we delete < threshold, only add if we have any result
 | |
| 				if len(v.percent) > 0 {
 | |
| 					//os.Stderr.WriteString("\r\nAdded file: " + m + "\r\n\r\n")
 | |
| 					s.Lock()
 | |
| 					results[m] = v
 | |
| 					s.Unlock()
 | |
| 				}
 | |
| 
 | |
| 			}
 | |
| 			wg.Done()
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	// feed data in
 | |
| 	var lastPc int = -1
 | |
| 	var i int
 | |
| 	for k, _ := range filerecords {
 | |
| 
 | |
| 		if len(c) > 0 {
 | |
| 			sig := <-c
 | |
| 			if sig == os.Interrupt {
 | |
| 				close(c)
 | |
| 				os.Stderr.WriteString("\r\nInterrupted. Waiting for workers to stop.\r\n\r\n")
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		workchan <- k
 | |
| 
 | |
| 		pc := int(100 * float64(i) / float64(len(filerecords)))
 | |
| 
 | |
| 		if pc != lastPc {
 | |
| 			fmt.Print("\r")
 | |
| 			os.Stderr.WriteString(fmt.Sprintf("Processing files data... %d%%   ", pc))
 | |
| 		}
 | |
| 
 | |
| 		lastPc = pc
 | |
| 		i++
 | |
| 	}
 | |
| 
 | |
| 	close(workchan)
 | |
| 	wg.Wait()
 | |
| 
 | |
| 	return results
 | |
| 
 | |
| }
 | |
| 
 | |
| func GetCatalogMap(d DiskCatalog) map[string]*DiskFile {
 | |
| 
 | |
| 	out := make(map[string]*DiskFile)
 | |
| 	for _, v := range d {
 | |
| 		out[v.SHA256] = v
 | |
| 	}
 | |
| 	return out
 | |
| 
 | |
| }
 | |
| 
 | |
| func CompareCatalogs(d, b DiskCatalog, r *FileOverlapRecord, key string) float64 {
 | |
| 
 | |
| 	var sameFiles float64
 | |
| 	var missingFiles float64
 | |
| 	var extraFiles float64
 | |
| 
 | |
| 	var dmap = GetCatalogMap(d)
 | |
| 	var bmap = GetCatalogMap(b)
 | |
| 
 | |
| 	for fileCk, info := range dmap {
 | |
| 
 | |
| 		if info.Size == 0 && EXCLUDEZEROBYTE {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		if info.Filename == "hello" && EXCLUDEHELLO {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		binfo, bEx := bmap[fileCk]
 | |
| 
 | |
| 		if bEx {
 | |
| 			sameFiles += 1
 | |
| 			// file match
 | |
| 			if r.files[key] == nil {
 | |
| 				r.files[key] = make(map[*DiskFile]*DiskFile)
 | |
| 			}
 | |
| 			//fmt.Printf("*** %s: %s -> %s\n", b.Filename, binfo.Filename, info.Filename)
 | |
| 			r.files[key][binfo] = info
 | |
| 		} else {
 | |
| 			missingFiles += 1
 | |
| 			// file match
 | |
| 			if r.missing[key] == nil {
 | |
| 				r.missing[key] = make([]*DiskFile, 0)
 | |
| 			}
 | |
| 			//fmt.Printf("*** %s: %s -> %s\n", b.Filename, binfo.Filename, info.Filename)
 | |
| 			r.missing[key] = append(r.missing[key], info)
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	for fileCk, info := range bmap {
 | |
| 
 | |
| 		if info.Size == 0 {
 | |
| 			continue
 | |
| 		}
 | |
| 
 | |
| 		_, dEx := dmap[fileCk]
 | |
| 
 | |
| 		if !dEx {
 | |
| 			extraFiles += 1
 | |
| 			// file match
 | |
| 			if r.extras[key] == nil {
 | |
| 				r.extras[key] = make([]*DiskFile, 0)
 | |
| 			}
 | |
| 			//fmt.Printf("*** %s: %s -> %s\n", b.Filename, binfo.Filename, info.Filename)
 | |
| 			r.extras[key] = append(r.extras[key], info)
 | |
| 		}
 | |
| 
 | |
| 	}
 | |
| 
 | |
| 	if (sameFiles + extraFiles + missingFiles) == 0 {
 | |
| 		return 0
 | |
| 	}
 | |
| 
 | |
| 	// return sameSectors / dTotal, sameSectors / bTotal, diffSectors / dTotal, diffSectors / btotal
 | |
| 	return sameFiles / (sameFiles + extraFiles + missingFiles)
 | |
| 
 | |
| }
 | |
| 
 | |
| func CollectFileSubsets(pathfilter []string) map[string]*FileOverlapRecord {
 | |
| 
 | |
| 	filerecords := GetAllFiles("*_*_*_*.fgp", pathfilter)
 | |
| 
 | |
| 	results := make(map[string]*FileOverlapRecord)
 | |
| 
 | |
| 	workchan := make(chan string, 100)
 | |
| 	var wg sync.WaitGroup
 | |
| 	var s sync.Mutex
 | |
| 
 | |
| 	c := make(chan os.Signal, 1)
 | |
| 	signal.Notify(c, os.Interrupt)
 | |
| 
 | |
| 	for i := 0; i < processWorkers; i++ {
 | |
| 		wg.Add(1)
 | |
| 		go func() {
 | |
| 			for m := range workchan {
 | |
| 
 | |
| 				v := &FileOverlapRecord{
 | |
| 					files:   make(map[string]map[*DiskFile]*DiskFile),
 | |
| 					percent: make(map[string]float64),
 | |
| 					missing: make(map[string][]*DiskFile),
 | |
| 					extras:  make(map[string][]*DiskFile),
 | |
| 				}
 | |
| 
 | |
| 				d := filerecords[m]
 | |
| 
 | |
| 				for k, b := range filerecords {
 | |
| 					if k == m {
 | |
| 						continue // dont compare ourselves
 | |
| 					}
 | |
| 					// ok good to compare -- only keep if we need our threshold
 | |
| 
 | |
| 					closeness := CompareCatalogs(d, b, v, k)
 | |
| 					if !v.IsSubsetOf(k) {
 | |
| 						v.Remove(k)
 | |
| 					} else {
 | |
| 						v.percent[k] = closeness
 | |
| 					}
 | |
| 				}
 | |
| 
 | |
| 				// since we delete < threshold, only add if we have any result
 | |
| 				if len(v.percent) > 0 {
 | |
| 					//os.Stderr.WriteString("\r\nAdded file: " + m + "\r\n\r\n")
 | |
| 					s.Lock()
 | |
| 					results[m] = v
 | |
| 					s.Unlock()
 | |
| 				}
 | |
| 
 | |
| 			}
 | |
| 			wg.Done()
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	// feed data in
 | |
| 	var lastPc int = -1
 | |
| 	var i int
 | |
| 	for k, _ := range filerecords {
 | |
| 
 | |
| 		if len(c) > 0 {
 | |
| 			sig := <-c
 | |
| 			if sig == os.Interrupt {
 | |
| 				close(c)
 | |
| 				os.Stderr.WriteString("\r\nInterrupted. Waiting for workers to stop.\r\n\r\n")
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		workchan <- k
 | |
| 
 | |
| 		pc := int(100 * float64(i) / float64(len(filerecords)))
 | |
| 
 | |
| 		if pc != lastPc {
 | |
| 			fmt.Print("\r")
 | |
| 			os.Stderr.WriteString(fmt.Sprintf("Processing files data... %d%%   ", pc))
 | |
| 		}
 | |
| 
 | |
| 		lastPc = pc
 | |
| 		i++
 | |
| 	}
 | |
| 
 | |
| 	close(workchan)
 | |
| 	wg.Wait()
 | |
| 
 | |
| 	return results
 | |
| 
 | |
| }
 | |
| 
 | |
| func CollectFilesOverlapsCustom(keep func(d1, d2 string, v *FileOverlapRecord) bool, pathfilter []string) map[string]*FileOverlapRecord {
 | |
| 
 | |
| 	filerecords := GetAllFiles("*_*_*_*.fgp", pathfilter)
 | |
| 
 | |
| 	results := make(map[string]*FileOverlapRecord)
 | |
| 
 | |
| 	workchan := make(chan string, 100)
 | |
| 	var wg sync.WaitGroup
 | |
| 	var s sync.Mutex
 | |
| 
 | |
| 	c := make(chan os.Signal, 1)
 | |
| 	signal.Notify(c, os.Interrupt)
 | |
| 
 | |
| 	for i := 0; i < processWorkers; i++ {
 | |
| 		wg.Add(1)
 | |
| 		go func() {
 | |
| 			for m := range workchan {
 | |
| 
 | |
| 				v := &FileOverlapRecord{
 | |
| 					files:   make(map[string]map[*DiskFile]*DiskFile),
 | |
| 					percent: make(map[string]float64),
 | |
| 					missing: make(map[string][]*DiskFile),
 | |
| 					extras:  make(map[string][]*DiskFile),
 | |
| 				}
 | |
| 
 | |
| 				d := filerecords[m]
 | |
| 
 | |
| 				for k, b := range filerecords {
 | |
| 					if k == m {
 | |
| 						continue // dont compare ourselves
 | |
| 					}
 | |
| 					// ok good to compare -- only keep if we need our threshold
 | |
| 					closeness := CompareCatalogs(d, b, v, k)
 | |
| 
 | |
| 					if !keep(m, k, v) {
 | |
| 						v.Remove(k)
 | |
| 					} else {
 | |
| 						v.percent[k] = closeness
 | |
| 					}
 | |
| 				}
 | |
| 
 | |
| 				// since we delete < threshold, only add if we have any result
 | |
| 				if len(v.files) > 0 {
 | |
| 					//os.Stderr.WriteString("\r\nAdded file: " + m + "\r\n\r\n")
 | |
| 					s.Lock()
 | |
| 					results[m] = v
 | |
| 					s.Unlock()
 | |
| 				}
 | |
| 
 | |
| 			}
 | |
| 			wg.Done()
 | |
| 		}()
 | |
| 	}
 | |
| 
 | |
| 	// feed data in
 | |
| 	var lastPc int = -1
 | |
| 	var i int
 | |
| 	for k, _ := range filerecords {
 | |
| 
 | |
| 		if len(c) > 0 {
 | |
| 			sig := <-c
 | |
| 			if sig == os.Interrupt {
 | |
| 				close(c)
 | |
| 				os.Stderr.WriteString("\r\nInterrupted. Waiting for workers to stop.\r\n\r\n")
 | |
| 				break
 | |
| 			}
 | |
| 		}
 | |
| 
 | |
| 		workchan <- k
 | |
| 
 | |
| 		pc := int(100 * float64(i) / float64(len(filerecords)))
 | |
| 
 | |
| 		if pc != lastPc {
 | |
| 			fmt.Print("\r")
 | |
| 			os.Stderr.WriteString(fmt.Sprintf("Processing files data... %d%%   ", pc))
 | |
| 		}
 | |
| 
 | |
| 		lastPc = pc
 | |
| 		i++
 | |
| 	}
 | |
| 
 | |
| 	close(workchan)
 | |
| 	wg.Wait()
 | |
| 
 | |
| 	return results
 | |
| 
 | |
| }
 |