// diskm8/report.go
// April Ayres-Griffiths 75d42c4597 initial commit
// 2018-01-20 11:05:04 +11:00
//
// 704 lines
// 15 KiB
// Go

package main
import (
"fmt"
"os"
"sort"
)
// DuplicateSource describes one item recorded under a checksum by the
// duplicate collections in this file.
type DuplicateSource struct {
	// Fullpath is the fullpath value handed to Add (callers in this file
	// pass Disk.FullPath).
	// Filename is only populated by DuplicateFileCollection.Add.
	// GSHA is only populated by DuplicateActiveSectorDiskCollection.Add,
	// which stores its checksum argument here.
	Fullpath, Filename, GSHA string
	// fingerprint holds the fgp value handed to Add.
	fingerprint string
}
// DuplicateFileCollection groups recorded files by checksum so that any
// checksum with more than one entry can be reported as a duplicate.
type DuplicateFileCollection struct {
// data maps a checksum to every source that Add recorded under it.
data map[string][]DuplicateSource
}
// DuplicateWholeDiskCollection groups recorded disks by checksum so that any
// checksum with more than one entry can be reported as a duplicate disk.
type DuplicateWholeDiskCollection struct {
// data maps a checksum to every source that Add recorded under it.
data map[string][]DuplicateSource
}
// DuplicateActiveSectorDiskCollection groups recorded disks by the achecksum
// value passed to Add, keeping each disk's other checksum in
// DuplicateSource.GSHA.
type DuplicateActiveSectorDiskCollection struct {
// data maps an achecksum to every source that Add recorded under it.
data map[string][]DuplicateSource
// NOTE(review): data_as is not read or written anywhere in the visible
// part of this file; confirm whether it is still needed.
data_as map[string][]DuplicateSource
}
// Add records one file under checksum, remembering the disk path, the file
// name and the fingerprint (fgp). The backing map is created on first use, so
// the zero value of DuplicateFileCollection is ready to use.
func (dfc *DuplicateFileCollection) Add(checksum string, fullpath string, filename string, fgp string) {
	if dfc.data == nil {
		dfc.data = map[string][]DuplicateSource{}
	}
	entry := DuplicateSource{
		Fullpath:    fullpath,
		Filename:    filename,
		fingerprint: fgp,
	}
	// append on a missing key starts a new list for that checksum.
	dfc.data[checksum] = append(dfc.data[checksum], entry)
}
// Add records one disk under checksum, remembering its path and fingerprint
// (fgp). The backing map is created on first use, so the zero value of
// DuplicateWholeDiskCollection is ready to use.
func (dfc *DuplicateWholeDiskCollection) Add(checksum string, fullpath string, fgp string) {
	if dfc.data == nil {
		dfc.data = map[string][]DuplicateSource{}
	}
	entry := DuplicateSource{
		Fullpath:    fullpath,
		fingerprint: fgp,
	}
	// append on a missing key starts a new list for that checksum.
	dfc.data[checksum] = append(dfc.data[checksum], entry)
}
// Add records one disk under achecksum. The disk's other checksum is kept in
// the entry's GSHA field, alongside its path and fingerprint (fgp). Only the
// data map is touched; it is created on first use.
func (dfc *DuplicateActiveSectorDiskCollection) Add(checksum string, achecksum string, fullpath string, fgp string) {
	if dfc.data == nil {
		dfc.data = map[string][]DuplicateSource{}
	}
	entry := DuplicateSource{
		Fullpath:    fullpath,
		GSHA:        checksum,
		fingerprint: fgp,
	}
	// append on a missing key starts a new list for that achecksum.
	dfc.data[achecksum] = append(dfc.data[achecksum], entry)
}
// Report lists every checksum that has more than one recorded file, followed
// by the disk path and file name of each copy.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
//
// Checksums are listed in sorted order so that repeated runs over the same
// data produce identical output (map iteration order is otherwise random).
func (dfc *DuplicateFileCollection) Report(filename string) {
	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	// Only checksums with at least two entries are duplicates.
	checksums := make([]string, 0, len(dfc.data))
	for sum, list := range dfc.data {
		if len(list) > 1 {
			checksums = append(checksums, sum)
		}
	}
	sort.Strings(checksums)

	for _, sum := range checksums {
		list := dfc.data[sum]
		fmt.Fprintf(w, "\nChecksum %s duplicated %d times:\n", sum, len(list))
		for i, v := range list {
			fmt.Fprintf(w, " %d) %s >> %s\n", i, v.Fullpath, v.Filename)
		}
	}
}
// AggregateDuplicateFiles adds every file of d to collection, keyed by the
// file's SHA256 and tagged with the disk's full path and source.
//
// collection must hold a *DuplicateFileCollection; the type assertion panics
// otherwise. The assertion runs once per file, so a disk without files never
// triggers it.
func AggregateDuplicateFiles(d *Disk, collection interface{}) {
	for _, file := range d.Files {
		target := collection.(*DuplicateFileCollection)
		target.Add(file.SHA256, d.FullPath, file.Filename, d.source)
	}
}
// AggregateDuplicateWholeDisks adds d to collection, keyed by the disk's
// SHA256 and tagged with its full path and source.
//
// collection must hold a *DuplicateWholeDiskCollection; the type assertion
// panics otherwise.
func AggregateDuplicateWholeDisks(d *Disk, collection interface{}) {
	target := collection.(*DuplicateWholeDiskCollection)
	target.Add(d.SHA256, d.FullPath, d.source)
}
// AggregateDuplicateActiveSectorDisks adds d to collection, keyed by the
// disk's SHA256Active, with its SHA256, full path and source attached.
//
// collection must hold a *DuplicateActiveSectorDiskCollection; the type
// assertion panics otherwise.
func AggregateDuplicateActiveSectorDisks(d *Disk, collection interface{}) {
	target := collection.(*DuplicateActiveSectorDiskCollection)
	target.Add(d.SHA256, d.SHA256Active, d.FullPath, d.source)
}
// Report lists every checksum that has more than one recorded disk. The first
// recorded disk is shown as the volume and the remaining ones as its
// duplicates. A summary with the number of duplicated volumes and redundant
// copies follows.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
//
// Checksums are processed in sorted order so that repeated runs over the same
// data produce identical output (map iteration order is otherwise random).
func (dfc *DuplicateWholeDiskCollection) Report(filename string) {
	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	// Only checksums with at least two entries are duplicates.
	checksums := make([]string, 0, len(dfc.data))
	for sum, list := range dfc.data {
		if len(list) > 1 {
			checksums = append(checksums, sum)
		}
	}
	sort.Strings(checksums)

	var disksWithDupes, extras int
	for _, sum := range checksums {
		list := dfc.data[sum]
		original, dupes := list[0], list[1:]
		disksWithDupes++
		extras += len(dupes)

		fmt.Fprintf(w, "\nVolume %s has %d duplicate(s):\n", original.Fullpath, len(dupes))
		for _, v := range dupes {
			fmt.Fprintf(w, " %s (sha256: %s)\n", v.Fullpath, sum)
		}
	}

	fmt.Fprint(w, "\nSUMMARY\n=======\n")
	fmt.Fprintf(w, "Total disks which have duplicates: %d\n", disksWithDupes)
	fmt.Fprintf(w, "Total redundant copies found : %d\n", extras)
}
// Report lists every key of data that has more than one recorded disk and at
// least two distinct GSHA values among them. Groups whose members all share a
// single GSHA are skipped. The first recorded disk is shown as the volume and
// the remaining ones as its duplicates; a summary follows.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
//
// Keys are processed in sorted order so that repeated runs over the same data
// produce identical output (map iteration order is otherwise random).
func (dfc *DuplicateActiveSectorDiskCollection) Report(filename string) {
	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	// Only keys with at least two entries can be duplicates.
	checksums := make([]string, 0, len(dfc.data))
	for sum, list := range dfc.data {
		if len(list) > 1 {
			checksums = append(checksums, sum)
		}
	}
	sort.Strings(checksums)

	var disksWithDupes, extras int
	for _, sum := range checksums {
		list := dfc.data[sum]

		// Skip the group when every member carries the same GSHA.
		distinct := make(map[string]struct{}, len(list))
		for _, v := range list {
			distinct[v.GSHA] = struct{}{}
		}
		if len(distinct) == 1 {
			continue
		}

		original, dupes := list[0], list[1:]
		disksWithDupes++

		fmt.Fprint(w, "\n")
		fmt.Fprint(w, "--------------------------------------\n")
		fmt.Fprintf(w, "Volume : %s\n", original.Fullpath)
		fmt.Fprintf(w, "Active SHA256: %s\n", sum)
		fmt.Fprintf(w, "Global SHA256: %s\n", original.GSHA)
		fmt.Fprintf(w, "# Duplicates : %d\n", len(dupes))
		for i, v := range dupes {
			fmt.Fprintf(w, "\n Duplicate #%d\n", i+1)
			fmt.Fprintf(w, " = Volume : %s\n", v.Fullpath)
			fmt.Fprintf(w, " = Active SHA256: %s\n", sum)
			fmt.Fprintf(w, " = Global SHA256: %s\n", v.GSHA)
			extras++
		}
		fmt.Fprint(w, "\n")
	}

	fmt.Fprint(w, "\nSUMMARY\n=======\n")
	fmt.Fprintf(w, "Total disks which have duplicates: %d\n", disksWithDupes)
	fmt.Fprintf(w, "Total redundant copies found : %d\n", extras)
}
// asPartialReport writes the partial active sector match report for d: the
// matches returned by d.GetPartialMatchesWithThreshold(t, pathfilter), one per
// line with the match factor as a percentage and the matched path.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
func asPartialReport(d *Disk, t float64, filename string, pathfilter []string) {
	matches := d.GetPartialMatchesWithThreshold(t, pathfilter)

	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	fmt.Fprintf(w, "PARTIAL ACTIVE SECTOR MATCH REPORT FOR %s (Above %.2f%%)\n\n", d.Filename, 100*t)

	sort.Sort(ByMatchFactor(matches))

	fmt.Fprintf(w, "%d matches found\n\n", len(matches))
	// The sorted slice is walked back to front.
	// NOTE(review): presumably ByMatchFactor sorts ascending, so this lists
	// the best match first; confirm against ByMatchFactor.Less.
	for i := len(matches) - 1; i >= 0; i-- {
		m := matches[i]
		fmt.Fprintf(w, "%.2f%%\t%s\n", m.MatchFactor*100, m.FullPath)
	}
}
// filePartialReport writes the partial file match report for d: the matches
// returned by d.GetPartialFileMatchesWithThreshold(t, pathfilter). Each match
// shows its match factor as a percentage, the matched path and the counts of
// missing and extra files, followed by the matched (==), missing (--) and
// extra (++) file names.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
func filePartialReport(d *Disk, t float64, filename string, pathfilter []string) {
	matches := d.GetPartialFileMatchesWithThreshold(t, pathfilter)

	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	fmt.Fprintf(w, "PARTIAL FILE MATCH REPORT FOR %s (Above %.2f%%)\n\n", d.Filename, 100*t)

	sort.Sort(ByMatchFactor(matches))

	fmt.Fprintf(w, "%d matches found\n\n", len(matches))
	// The sorted slice is walked back to front.
	// NOTE(review): presumably ByMatchFactor sorts ascending, so this lists
	// the best match first; confirm against ByMatchFactor.Less.
	for i := len(matches) - 1; i >= 0; i-- {
		m := matches[i]
		fmt.Fprintf(w, "%.2f%%\t%s (%d missing, %d extras)\n", m.MatchFactor*100, m.FullPath, len(m.MissingFiles), len(m.ExtraFiles))
		for f1, f2 := range m.MatchFiles {
			fmt.Fprintf(w, "\t == %s -> %s\n", f1.Filename, f2.Filename)
		}
		for _, f := range m.MissingFiles {
			fmt.Fprintf(w, "\t -- %s\n", f.Filename)
		}
		for _, f := range m.ExtraFiles {
			fmt.Fprintf(w, "\t ++ %s\n", f.Filename)
		}
	}
}
// fileMatchReport writes the matches returned by
// d.GetFileMatches(filename, pathfilter): each matched path with its index,
// followed by the matched file name pairs.
//
// The report is written to the file named by filename, or to standard output
// when filename is empty. If the file cannot be created the error is printed
// to standard error and nothing is reported.
//
// NOTE(review): filename is used both as the name looked up with
// GetFileMatches and as the path of the report file, so any non-empty
// filename sends the report to a file of that name in the working directory.
// This looks like a copy/paste slip from the other report functions (which
// are handed *reportFile); confirm the intended destination before changing.
func fileMatchReport(d *Disk, filename string, pathfilter []string) {
	matches := d.GetFileMatches(filename, pathfilter)

	w := os.Stdout
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create report file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	fmt.Fprintf(w, "PARTIAL FILE MATCH REPORT FOR %s (File: %s)\n\n", d.Filename, filename)
	fmt.Fprintf(w, "%d matches found\n\n", len(matches))
	for i, m := range matches {
		fmt.Fprintf(w, "%d)\t%s\n", i, m.FullPath)
		for f1, f2 := range m.MatchFiles {
			fmt.Fprintf(w, "\t == %s -> %s\n", f1.Filename, f2.Filename)
		}
	}
}
// fileDupeReport aggregates duplicate files over the disks selected by filter,
// prints the report heading to standard output and writes the report to
// *reportFile (standard output when it is empty).
func fileDupeReport(filter []string) {
	var collection DuplicateFileCollection
	Aggregate(AggregateDuplicateFiles, &collection, filter)

	fmt.Print("DUPLICATE FILE REPORT\n\n")

	collection.Report(*reportFile)
}
// wholeDupeReport aggregates duplicate whole disks over the disks selected by
// filter, prints the report heading to standard output and writes the report
// to *reportFile (standard output when it is empty).
func wholeDupeReport(filter []string) {
	var collection DuplicateWholeDiskCollection
	Aggregate(AggregateDuplicateWholeDisks, &collection, filter)

	fmt.Print("DUPLICATE WHOLE DISK REPORT\n\n")

	collection.Report(*reportFile)
}
// activeDupeReport aggregates active-sector duplicates over the disks selected
// by filter, prints the report heading to standard output and writes the
// report to *reportFile (standard output when it is empty).
func activeDupeReport(filter []string) {
	var collection DuplicateActiveSectorDiskCollection
	Aggregate(AggregateDuplicateActiveSectorDisks, &collection, filter)

	fmt.Print("DUPLICATE ACTIVE SECTORS DISK REPORT\n\n")

	collection.Report(*reportFile)
}
// allFilesPartialReport prints the file overlaps returned by
// CollectFilesOverlapsAboveThreshold(t, filter) to standard output. For each
// disk it lists every overlapping disk with its match percentage and the
// matched (==), missing (--) and extra (++) file names.
//
// When *csvOut is set the matches are handed to dumpFileOverlapCSV instead.
// oheading replaces the default heading when it is not empty.
func allFilesPartialReport(t float64, filter []string, oheading string) {
	matches := CollectFilesOverlapsAboveThreshold(t, filter)

	if *csvOut {
		dumpFileOverlapCSV(matches, *reportFile)
		return
	}

	heading := oheading
	if heading == "" {
		heading = fmt.Sprintf("PARTIAL ALL FILE MATCH REPORT (Above %.2f%%)", 100*t)
	}
	fmt.Printf("%s\n\n", heading)

	fmt.Printf("%d matches found\n\n", len(matches))
	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other, ratio := range rec.percent {
			fmt.Printf("\n :: %.2f%% Match to %s\n", 100*ratio, other)
			for src, dst := range rec.files[other] {
				fmt.Printf(" == %s -> %s\n", src.Filename, dst.Filename)
			}
			for _, gone := range rec.missing[other] {
				fmt.Printf(" -- %s\n", gone.Filename)
			}
			for _, added := range rec.extras[other] {
				fmt.Printf(" ++ %s\n", added.Filename)
			}
			fmt.Println()
		}
		fmt.Println()
	}
	fmt.Println()
}
// allSectorsPartialReport prints the sector overlaps returned by
// CollectSectorOverlapsAboveThreshold(t, filter, GetAllDiskSectors) to
// standard output. For each disk it lists every overlapping disk with its
// match percentage and the counts of matched, missing and extra sectors.
//
// When *csvOut is set the matches are handed to dumpSectorOverlapCSV instead.
func allSectorsPartialReport(t float64, filter []string) {
	matches := CollectSectorOverlapsAboveThreshold(t, filter, GetAllDiskSectors)

	if *csvOut {
		dumpSectorOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Printf("NON-ZERO SECTOR MATCH REPORT (Above %.2f%%)\n\n", 100*t)
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other, ratio := range rec.percent {
			fmt.Printf("\n :: %.2f%% Match to %s\n", 100*ratio, other)
			fmt.Printf(" == %d Sectors matched\n", len(rec.same[other]))
			fmt.Printf(" -- %d Sectors missing\n", len(rec.missing[other]))
			fmt.Printf(" ++ %d Sectors extra\n\n", len(rec.extras[other]))
		}
		fmt.Println()
	}
	fmt.Println()
}
// activeSectorsPartialReport prints the sector overlaps returned by
// CollectSectorOverlapsAboveThreshold(t, filter, GetActiveDiskSectors) to
// standard output. For each disk it lists every overlapping disk with its
// match percentage and the counts of matched, missing and extra sectors.
//
// When *csvOut is set the matches are handed to dumpSectorOverlapCSV instead.
func activeSectorsPartialReport(t float64, filter []string) {
	matches := CollectSectorOverlapsAboveThreshold(t, filter, GetActiveDiskSectors)

	if *csvOut {
		dumpSectorOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Printf("PARTIAL ACTIVE SECTOR MATCH REPORT (Above %.2f%%)\n\n", 100*t)
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other, ratio := range rec.percent {
			fmt.Printf("\n :: %.2f%% Match to %s\n", 100*ratio, other)
			fmt.Printf(" == %d Sectors matched\n", len(rec.same[other]))
			fmt.Printf(" -- %d Sectors missing\n", len(rec.missing[other]))
			fmt.Printf(" ++ %d Sectors extra\n\n", len(rec.extras[other]))
		}
		fmt.Println()
	}
	fmt.Println()
}
// allFilesSubsetReport prints the file subsets returned by
// CollectFileSubsets(filter) to standard output. For each disk it lists every
// disk it is a file subset of, with the matched (==), missing (--) and extra
// (++) file names.
//
// When *csvOut is set the matches are handed to dumpFileOverlapCSV instead.
func allFilesSubsetReport(filter []string) {
	matches := CollectFileSubsets(filter)

	if *csvOut {
		dumpFileOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Print("SUBSET DISK FILE MATCH REPORT\n\n")
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other := range rec.percent {
			fmt.Printf("\n :: Is a file subset of %s\n", other)
			for src, dst := range rec.files[other] {
				fmt.Printf(" == %s -> %s\n", src.Filename, dst.Filename)
			}
			for _, gone := range rec.missing[other] {
				fmt.Printf(" -- %s\n", gone.Filename)
			}
			for _, added := range rec.extras[other] {
				fmt.Printf(" ++ %s\n", added.Filename)
			}
			fmt.Println()
		}
		fmt.Println()
	}
	fmt.Println()
}
// activeSectorsSubsetReport prints the sector subsets returned by
// CollectSectorSubsets(filter, GetActiveDiskSectors) to standard output. For
// each disk it lists every disk it is a subset of, with the counts of matched
// and extra sectors.
//
// When *csvOut is set the matches are handed to dumpSectorOverlapCSV instead.
func activeSectorsSubsetReport(filter []string) {
	matches := CollectSectorSubsets(filter, GetActiveDiskSectors)

	if *csvOut {
		dumpSectorOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Print("ACTIVE SECTOR SUBSET MATCH REPORT\n\n")
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other := range rec.percent {
			fmt.Printf("\n :: Is a subset (based on active sectors) of %s\n", other)
			fmt.Printf(" == %d Sectors matched\n", len(rec.same[other]))
			fmt.Printf(" ++ %d Sectors extra\n\n", len(rec.extras[other]))
		}
		fmt.Println()
	}
	fmt.Println()
}
// allSectorsSubsetReport prints the sector subsets returned by
// CollectSectorSubsets(filter, GetAllDiskSectors) to standard output. For each
// disk it lists every disk it is a subset of, with the counts of matched and
// extra sectors.
//
// When *csvOut is set the matches are handed to dumpSectorOverlapCSV instead.
func allSectorsSubsetReport(filter []string) {
	matches := CollectSectorSubsets(filter, GetAllDiskSectors)

	if *csvOut {
		dumpSectorOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Print("NON-ZERO SECTOR SUBSET MATCH REPORT\n\n")
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other := range rec.percent {
			// This report compares sectors from GetAllDiskSectors, so the
			// line must not claim the comparison used active sectors.
			fmt.Printf("\n :: Is a subset (based on non-zero sectors) of %s\n", other)
			fmt.Printf(" == %d Sectors matched\n", len(rec.same[other]))
			fmt.Printf(" ++ %d Sectors extra\n\n", len(rec.extras[other]))
		}
		fmt.Println()
	}
	fmt.Println()
}
// dumpFileOverlapCSV writes matches as CSV with the columns
// MATCH,DISK1,FILENAME1,DISK2,FILENAME2,EXISTS. Matched file pairs are marked
// "Y"; files listed as missing or extra get an empty name on the other side
// and are marked "N".
//
// The CSV is written to the file named by filename, or to standard error when
// filename is empty. After writing a named file a "Wrote ..." line is printed
// to standard output. If the file cannot be created the error is printed to
// standard error and nothing is written.
func dumpFileOverlapCSV(matches map[string]*FileOverlapRecord, filename string) {
	w := os.Stderr
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create CSV file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	// field renders v as a double-quoted CSV field. Embedded double quotes
	// are doubled (RFC 4180) so names containing '"' cannot break the row.
	field := func(v interface{}) string {
		raw := fmt.Sprintf("%s", v)
		out := make([]byte, 0, len(raw)+2)
		out = append(out, '"')
		for i := 0; i < len(raw); i++ {
			if raw[i] == '"' {
				out = append(out, '"')
			}
			out = append(out, raw[i])
		}
		return string(append(out, '"'))
	}

	fmt.Fprint(w, "MATCH,DISK1,FILENAME1,DISK2,FILENAME2,EXISTS\n")
	for disk1, rec := range matches {
		for disk2, match := range rec.percent {
			for f1, f2 := range rec.files[disk2] {
				fmt.Fprintf(w, "%.2f,%s,%s,%s,%s,%s\n", match, field(disk1), field(f1.Filename), field(disk2), field(f2.Filename), "Y")
			}
			for _, f1 := range rec.missing[disk2] {
				fmt.Fprintf(w, "%.2f,%s,%s,%s,%s,%s\n", match, field(disk1), field(f1.Filename), field(disk2), field(""), "N")
			}
			for _, f2 := range rec.extras[disk2] {
				fmt.Fprintf(w, "%.2f,%s,%s,%s,%s,%s\n", match, field(disk1), field(""), field(disk2), field(f2.Filename), "N")
			}
		}
	}

	if filename != "" {
		fmt.Printf("\nWrote %s\n\n", filename)
	}
}
// dumpSectorOverlapCSV writes matches as CSV with the columns
// MATCH,DISK1,DISK2,SAME,MISSING,EXTRA, one row per pair of disks, where the
// last three columns are the sizes of the record's same, missing and extras
// entries for that pair.
//
// The CSV is written to the file named by filename, or to standard error when
// filename is empty. After writing a named file a "Wrote ..." line is printed
// to standard output. If the file cannot be created the error is printed to
// standard error and nothing is written.
func dumpSectorOverlapCSV(matches map[string]*SectorOverlapRecord, filename string) {
	w := os.Stderr
	if filename != "" {
		f, err := os.Create(filename)
		if err != nil {
			fmt.Fprintf(os.Stderr, "unable to create CSV file %q: %v\n", filename, err)
			return
		}
		defer f.Close()
		w = f
	}

	// field renders v as a double-quoted CSV field. Embedded double quotes
	// are doubled (RFC 4180) so names containing '"' cannot break the row.
	field := func(v interface{}) string {
		raw := fmt.Sprintf("%s", v)
		out := make([]byte, 0, len(raw)+2)
		out = append(out, '"')
		for i := 0; i < len(raw); i++ {
			if raw[i] == '"' {
				out = append(out, '"')
			}
			out = append(out, raw[i])
		}
		return string(append(out, '"'))
	}

	fmt.Fprint(w, "MATCH,DISK1,DISK2,SAME,MISSING,EXTRA\n")
	for disk1, rec := range matches {
		for disk2, match := range rec.percent {
			fmt.Fprintf(w, "%.2f,%s,%s,%d,%d,%d\n", match, field(disk1), field(disk2), len(rec.same[disk2]), len(rec.missing[disk2]), len(rec.extras[disk2]))
		}
	}

	if filename != "" {
		fmt.Printf("\nWrote %s\n\n", filename)
	}
}
// keeperAtLeastNSame reports whether the record v lists at least *minSame
// matched files for d2. d1 is not used.
func keeperAtLeastNSame(d1, d2 string, v *FileOverlapRecord) bool {
	shared := len(v.files[d2])
	return shared >= *minSame
}
// keeperMaximumNDiff reports whether the record v lists at least one matched
// file for d2 and the number of missing plus extra files for d2 does not
// exceed *maxDiff. d1 is not used.
func keeperMaximumNDiff(d1, d2 string, v *FileOverlapRecord) bool {
	if len(v.files[d2]) == 0 {
		return false
	}
	differing := len(v.missing[d2]) + len(v.extras[d2])
	return differing <= *maxDiff
}
// allFilesCustomReport prints the file overlaps returned by
// CollectFilesOverlapsCustom(keep, filter) to standard output under the
// heading oheading. For each disk it lists every overlapping disk with its
// match percentage and the matched (==), missing (--) and extra (++) file
// names.
//
// When *csvOut is set the matches are handed to dumpFileOverlapCSV instead.
func allFilesCustomReport(keep func(d1, d2 string, v *FileOverlapRecord) bool, filter []string, oheading string) {
	matches := CollectFilesOverlapsCustom(keep, filter)

	if *csvOut {
		dumpFileOverlapCSV(matches, *reportFile)
		return
	}

	fmt.Printf("%s\n\n", oheading)
	fmt.Printf("%d matches found\n\n", len(matches))

	for disk, rec := range matches {
		fmt.Printf("Disk: %s\n", disk)
		for other, ratio := range rec.percent {
			fmt.Printf("\n :: %.2f%% Match to %s\n", 100*ratio, other)
			for src, dst := range rec.files[other] {
				fmt.Printf(" == %s -> %s\n", src.Filename, dst.Filename)
			}
			for _, gone := range rec.missing[other] {
				fmt.Printf(" -- %s\n", gone.Filename)
			}
			for _, added := range rec.extras[other] {
				fmt.Printf(" ++ %s\n", added.Filename)
			}
			fmt.Println()
		}
		fmt.Println()
	}
	fmt.Println()
}