123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464 |
- package main
- import (
- "bufio"
- "crypto/sha256"
- "encoding/hex"
- "encoding/json"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "os"
- "path/filepath"
- "runtime"
- "sort"
- "strings"
- "sync"
- "time"
- "code.cloudfoundry.org/bytefmt"
- flag "github.com/spf13/pflag"
- )
// Fdhashes holds all information about one folder: the hash and the
// modification time of every file in it, plus a dirty flag controlling
// whether the record still has to be written back to disk.
type Fdhashes struct {
	Path   string               // folder this record belongs to
	Hashes map[string]string    // file name -> hex-encoded SHA-256 hash
	Times  map[string]time.Time // file name -> modification time when hashed
	Dirty  bool                 // true when the in-memory record differs from disk
}
- var hashes map[string]Fdhashes
- var ignoreLines []string
- var mu sync.RWMutex
- var driveLetter string
- var rewrite bool
- var prune bool
- var outputJson bool
- var report string
- var ignores string
- func init() {
- flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fhhashes files.")
- flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and writing a equlatity report.")
- flag.BoolVarP(&prune, "prune", "p", false, "checking all fdhashes files.")
- flag.BoolVarP(&outputJson, "json", "j", false, "output as json.")
- flag.StringVarP(&ignores, "ignores", "i", "", "list of files to ignore in report.")
- }
- func main() {
- log.Println("starting GoHash")
- ignoreLines = make([]string, 0)
- hashes = make(map[string]Fdhashes)
- flag.Parse()
- myFile := flag.Arg(0)
- if rewrite {
- log.Println("rewrite active")
- }
- if prune {
- log.Println("prune active")
- }
- if outputJson {
- log.Println("output json format active")
- }
- if report != "" {
- log.Println("report active, file: ", report)
- }
- if ignores != "" {
- log.Println("ignores file: ", ignores)
- }
- file, err := os.Stat(myFile)
- if os.IsNotExist(err) {
- log.Fatalln("File does not exists:", myFile)
- }
- if file.IsDir() {
- log.Println("start with folder:", myFile)
- driveLetter = ""
- if runtime.GOOS == "windows" {
- driveLetter = filepath.VolumeName(myFile) + "/"
- }
- if report != "" {
- compareFolder(myFile)
- } else {
- processFolder(myFile)
- saveAllHashFiles()
- }
- } else {
- log.Printf("file %s has hash %s\n", myFile, getSha256Hash(myFile))
- }
- log.Println("done")
- }
- func getSha256Hash(fileStr string) string {
- f, err := os.Open(fileStr)
- if err != nil {
- log.Fatal(err)
- }
- defer f.Close()
- h := sha256.New()
- if _, err := io.Copy(h, f); err != nil {
- log.Fatal(err)
- }
- return hex.EncodeToString(h.Sum(nil))
- }
// lock1 guards the Hashes maps, lock2 the Times maps of Fdhashes records.
var (
	lock1 = sync.RWMutex{}
	lock2 = sync.RWMutex{}
)
// calculateHash ensures an up-to-date SHA-256 entry for fileStr in the
// per-directory hash cache. The hash is recomputed when the file has no
// cached entry yet, or when its current modification time differs from the
// cached one. The global hashes map is guarded by mu; the Hashes and Times
// maps of an Fdhashes record are guarded by lock1 and lock2 respectively.
func calculateHash(fileStr string) {
	var hashFile Fdhashes
	doHash := true
	dir, fileName := filepath.Split(fileStr)
	// the hash file itself must never be hashed
	if fileName == ".fdhashes3" {
		return
	}
	// load (or create) the folder record and check whether a hash is present
	mu.Lock()
	hashFile, ok := hashes[dir]
	if !ok {
		_, err := os.Stat(dir + ".fdhashes3")
		if os.IsNotExist(err) {
			// no hash file on disk yet: start an empty, dirty record
			hashFile = Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
		} else {
			hashFile = loadHashfile(dir + ".fdhashes3")
		}
		hashes[dir] = hashFile
	}
	lock1.RLock()
	_, ok = hashFile.Hashes[fileName]
	lock1.RUnlock()
	mu.Unlock()
	doHash = !ok
	// checking if the stored modification time is identical to the file's
	// NOTE(review): the Stat error is ignored here; if the file vanished
	// between walk and hash, file is nil and ModTime panics — confirm
	// callers only pass existing files.
	file, _ := os.Stat(fileStr)
	time := file.ModTime() // shadows the time package for the rest of this function
	lock2.RLock()
	savedTime, ok := hashFile.Times[fileName]
	lock2.RUnlock()
	if !time.Equal(savedTime) || !ok {
		doHash = true
	}
	if doHash {
		log.Printf("starting %s\n", fileStr)
		hash := getSha256Hash(fileStr)
		log.Printf("ready %s\n", fileStr)
		// store hash and mtime, then mark the record dirty so it gets saved
		mu.Lock()
		lock1.Lock()
		hashFile.Hashes[fileName] = hash
		lock1.Unlock()
		lock2.Lock()
		hashFile.Times[fileName] = time
		lock2.Unlock()
		dirtyHashfile(&hashFile)
		hashes[dir] = hashFile
		mu.Unlock()
		log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
	}
}
// Progress bookkeeping for the folder walkers.
var (
	count     int       // number of entries visited so far
	addWork   int       // files hashed since the last periodic flush
	startTime time.Time // time of the last periodic flush
)
- func processFolder(folder string) {
- startTime = time.Now()
- count = 0
- addWork = 0
- err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
- count++
- if (count % 100) == 0 {
- fmt.Print(".")
- }
- if (count % 10000) == 0 {
- fmt.Println()
- }
- filename := info.Name()
- if filename[0:1] != "." {
- if info.IsDir() {
- fmt.Println(path)
- if prune {
- pruneHash(path)
- }
- }
- if !info.IsDir() {
- addWork++
- calculateHash(path)
- if time.Since(startTime).Seconds() > 10.0 {
- startTime = time.Now()
- saveAllHashFiles()
- addWork = 0
- }
- }
- }
- return nil
- })
- if err != nil {
- panic(err)
- }
- }
- /* delete unused hash values from the hash file */
- func pruneHash(dir string) {
- _, err := os.Stat(dir + "/.fdhashes3")
- if !os.IsNotExist(err) {
- hashFile := loadHashfile(dir + "/.fdhashes3")
- for filename := range hashFile.Hashes {
- _, err := os.Stat(dir + "/" + filename)
- if os.IsNotExist(err) {
- delete(hashFile.Hashes, filename)
- delete(hashFile.Times, filename)
- hashFile.Dirty = true
- }
- }
- for filename := range hashFile.Times {
- _, err := os.Stat(dir + "/" + filename)
- if os.IsNotExist(err) {
- delete(hashFile.Hashes, filename)
- delete(hashFile.Times, filename)
- hashFile.Dirty = true
- }
- }
- saveHashfile(&hashFile)
- }
- }
// dirtyHashfile marks the given hash file as modified so that the next
// save pass writes it back to disk.
func dirtyHashfile(hashFile *Fdhashes) {
	hashFile.Dirty = true
}
// saveAllHashFiles writes every dirty hash file to disk and then rebuilds
// the in-memory cache so that only the just-saved records remain; clean
// records are intentionally dropped and will be reloaded from disk on
// demand, keeping memory usage bounded on long runs.
func saveAllHashFiles() {
	hashList := make([]Fdhashes, 0)
	for _, hashFile := range hashes {
		if hashFile.Dirty {
			// saveHashfile clears the Dirty flag through the pointer, so
			// the copy appended below is stored as clean
			saveHashfile(&hashFile)
			hashList = append(hashList, hashFile)
		}
	}
	// reset the cache, keeping only the records saved in this round
	hashes = make(map[string]Fdhashes)
	for _, hashFile := range hashList {
		hashes[hashFile.Path] = hashFile
	}
}
- func saveHashfile(hashFile *Fdhashes) {
- if hashFile.Dirty {
- hashFile.Dirty = false
- b, err := json.Marshal(hashFile)
- if err != nil {
- fmt.Println(err)
- return
- }
- err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
- if err != nil {
- panic(err)
- }
- }
- }
- func loadHashfile(fileStr string) Fdhashes {
- dir, _ := filepath.Split(fileStr)
- dir = filepath.ToSlash(filepath.Clean(dir))
- data := Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: false}
- if !rewrite {
- file, err := ioutil.ReadFile(fileStr)
- if err != nil {
- panic(err)
- }
- err = json.Unmarshal([]byte(file), &data)
- if err != nil {
- log.Printf("can't read file %s", fileStr)
- }
- }
- if data.Path != dir {
- data.Path = dir
- data.Dirty = true
- }
- return data
- }
- func compareFolder(folder string) {
- loadIgnoreFile(ignores)
- loadAllHashFiles(folder)
- // putting all hashes into one big map key = hash, value list of files with that hash
- size := len(hashes)
- index := make(map[string][]string)
- count = 0
- for _, hashFile := range hashes {
- count++
- if count%100 == 0 {
- fmt.Printf("%d (%d) merging\n", count, size)
- }
- for filename, hash := range hashFile.Hashes {
- values := index[hash]
- if values == nil {
- values = make([]string, 0)
- }
- filepath := fmt.Sprintf("%s/%s", hashFile.Path, filename)
- pos := sort.SearchStrings(ignoreLines, filepath)
- if pos == len(ignoreLines) {
- _, err := os.Stat(filepath)
- if err == nil {
- values = append(values, filepath)
- index[hash] = values
- }
- }
- }
- }
- // sorting list of files for every hash and deleting hashes with only 1 entry
- size = len(index)
- myHashes := make([]string, 0)
- count = 0
- for hash, values := range index {
- count++
- if count%100 == 0 {
- fmt.Printf("%d (%d) sorting\n", count, size)
- }
- if len(values) > 1 {
- sort.Strings(values)
- index[hash] = values
- myHashes = append(myHashes, hash)
- // for _, filename := range values {
- // fmt.Printf(" %s\n", filename)
- // }
- } else {
- delete(index, hash)
- }
- }
- sort.Slice(myHashes, func(i, j int) bool { return index[myHashes[i]][0] < index[myHashes[j]][0] })
- if outputJson {
- size = len(index)
- var filesize int64
- fileCount := 0
- for _, hash := range myHashes {
- values := index[hash]
- count++
- if count%100 == 0 {
- fmt.Printf("%d (%d) checking\n", count, size)
- }
- if len(values) > 1 {
- info, err := os.Stat(values[0])
- if err == nil {
- fmt.Printf("found identically hash: %s size: %d\n", hash, info.Size())
- filesize += int64(len(values)-1) * info.Size()
- }
- fileCount += len(values) - 1
- for _, filename := range values {
- fmt.Printf(" %s\n", filename)
- }
- } else {
- delete(index, hash)
- }
- }
- b, err := json.Marshal(index)
- if err != nil {
- fmt.Println(err)
- return
- }
- err = ioutil.WriteFile(report, b, 0644)
- if err != nil {
- panic(err)
- }
- } else {
- size := len(index)
- f, err := os.Create(report)
- check(err)
- w := bufio.NewWriter(f)
- count := 0
- var filesize int64
- fileCount := 0
- for _, hash := range myHashes {
- values := index[hash]
- count++
- if count%100 == 0 {
- fmt.Printf("%d (%d) checking\n", count, size)
- }
- if len(values) > 1 {
- info, err := os.Stat(values[0])
- if err == nil {
- w.WriteString(fmt.Sprintf("found identically hash: size: %d\n", info.Size()))
- filesize += int64(len(values)-1) * info.Size()
- }
- fileCount += len(values) - 1
- for _, filename := range values {
- w.WriteString(fmt.Sprintf(" %s\n", filename))
- }
- w.Flush()
- }
- }
- w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
- w.Flush()
- }
- }
- func loadIgnoreFile(filename string) {
- content, err := ioutil.ReadFile(filename)
- if err == nil {
- ignoreLines = strings.Split(string(content), "\n")
- sort.Strings(ignoreLines)
- }
- }
- func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
- for _, hashFile := range hashes {
- for filename, hash := range hashFile.Hashes {
- if (filename != exFilename) && (hashFile.Path != exFilepath) {
- if hash == srcHash {
- value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
- found = true
- }
- }
- }
- }
- return
- }
- func loadAllHashFiles(folder string) {
- count = 0
- addWork = 0
- err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
- if info != nil {
- if info.IsDir() {
- count++
- fmt.Print(".")
- if (count % 100) == 0 {
- fmt.Println()
- }
- hashFile, ok := hashes[path]
- if !ok {
- _, err := os.Stat(path + "/.fdhashes3")
- if os.IsNotExist(err) {
- hashFile = Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
- } else {
- hashFile = loadHashfile(path + "/.fdhashes3")
- }
- hashes[path] = hashFile
- }
- }
- }
- return nil
- })
- if err != nil {
- panic(err)
- }
- fmt.Printf("\nfound %d hash files.\n", len(hashes))
- }
- func check(e error) {
- if e != nil {
- panic(e)
- }
- }
|