GoHash.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464
  1. package main
  2. import (
  3. "bufio"
  4. "crypto/sha256"
  5. "encoding/hex"
  6. "encoding/json"
  7. "fmt"
  8. "io"
  9. "io/ioutil"
  10. "log"
  11. "os"
  12. "path/filepath"
  13. "runtime"
  14. "sort"
  15. "strings"
  16. "sync"
  17. "time"
  18. "code.cloudfoundry.org/bytefmt"
  19. flag "github.com/spf13/pflag"
  20. )
// Fdhashes holds all hash information about one folder.
type Fdhashes struct {
	Path   string               // path of the folder this hash file describes
	Hashes map[string]string    // file name -> hex-encoded SHA-256 hash
	Times  map[string]time.Time // file name -> modification time recorded at hashing
	Dirty  bool                 // true when in-memory state differs from .fdhashes3 on disk
}
var hashes map[string]Fdhashes // per-folder hash data, keyed by directory path
var ignoreLines []string       // sorted list of file paths to exclude from the report
var mu sync.RWMutex            // guards the hashes map
var driveLetter string         // windows volume prefix ("C:/"), empty on other platforms
var rewrite bool               // --rewrite: ignore existing .fdhashes3 content and rehash
var prune bool                 // --prune: drop hash entries for files that no longer exist
var outputJson bool            // --json: write the equality report as JSON
var report string              // --equals: path of the equality report to write
var ignores string             // --ignores: path of a file listing paths to skip
  37. func init() {
  38. flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fhhashes files.")
  39. flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and writing a equlatity report.")
  40. flag.BoolVarP(&prune, "prune", "p", false, "checking all fdhashes files.")
  41. flag.BoolVarP(&outputJson, "json", "j", false, "output as json.")
  42. flag.StringVarP(&ignores, "ignores", "i", "", "list of files to ignore in report.")
  43. }
  44. func main() {
  45. log.Println("starting GoHash")
  46. ignoreLines = make([]string, 0)
  47. hashes = make(map[string]Fdhashes)
  48. flag.Parse()
  49. myFile := flag.Arg(0)
  50. if rewrite {
  51. log.Println("rewrite active")
  52. }
  53. if prune {
  54. log.Println("prune active")
  55. }
  56. if outputJson {
  57. log.Println("output json format active")
  58. }
  59. if report != "" {
  60. log.Println("report active, file: ", report)
  61. }
  62. if ignores != "" {
  63. log.Println("ignores file: ", ignores)
  64. }
  65. file, err := os.Stat(myFile)
  66. if os.IsNotExist(err) {
  67. log.Fatalln("File does not exists:", myFile)
  68. }
  69. if file.IsDir() {
  70. log.Println("start with folder:", myFile)
  71. driveLetter = ""
  72. if runtime.GOOS == "windows" {
  73. driveLetter = filepath.VolumeName(myFile) + "/"
  74. }
  75. if report != "" {
  76. compareFolder(myFile)
  77. } else {
  78. processFolder(myFile)
  79. saveAllHashFiles()
  80. }
  81. } else {
  82. log.Printf("file %s has hash %s\n", myFile, getSha256Hash(myFile))
  83. }
  84. log.Println("done")
  85. }
  86. func getSha256Hash(fileStr string) string {
  87. f, err := os.Open(fileStr)
  88. if err != nil {
  89. log.Fatal(err)
  90. }
  91. defer f.Close()
  92. h := sha256.New()
  93. if _, err := io.Copy(h, f); err != nil {
  94. log.Fatal(err)
  95. }
  96. return hex.EncodeToString(h.Sum(nil))
  97. }
// lock1 guards the Hashes maps and lock2 the Times maps of the shared hash files.
var lock1 = sync.RWMutex{}
var lock2 = sync.RWMutex{}
  100. func calculateHash(fileStr string) {
  101. var hashFile Fdhashes
  102. doHash := true
  103. dir, fileName := filepath.Split(fileStr)
  104. if fileName == ".fdhashes3" {
  105. return
  106. }
  107. // checking if hash is present
  108. mu.Lock()
  109. hashFile, ok := hashes[dir]
  110. if !ok {
  111. _, err := os.Stat(dir + ".fdhashes3")
  112. if os.IsNotExist(err) {
  113. hashFile = Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  114. } else {
  115. hashFile = loadHashfile(dir + ".fdhashes3")
  116. }
  117. hashes[dir] = hashFile
  118. }
  119. lock1.RLock()
  120. _, ok = hashFile.Hashes[fileName]
  121. lock1.RUnlock()
  122. mu.Unlock()
  123. doHash = !ok
  124. // checking if dattime is identically
  125. file, _ := os.Stat(fileStr)
  126. time := file.ModTime()
  127. lock2.RLock()
  128. savedTime, ok := hashFile.Times[fileName]
  129. lock2.RUnlock()
  130. if !time.Equal(savedTime) || !ok {
  131. doHash = true
  132. }
  133. if doHash {
  134. log.Printf("starting %s\n", fileStr)
  135. hash := getSha256Hash(fileStr)
  136. log.Printf("ready %s\n", fileStr)
  137. mu.Lock()
  138. lock1.Lock()
  139. hashFile.Hashes[fileName] = hash
  140. lock1.Unlock()
  141. lock2.Lock()
  142. hashFile.Times[fileName] = time
  143. lock2.Unlock()
  144. dirtyHashfile(&hashFile)
  145. hashes[dir] = hashFile
  146. mu.Unlock()
  147. log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
  148. }
  149. }
var count int           // number of walked entries, used for progress output
var addWork int         // files hashed since the last periodic save
var startTime time.Time // start of the current 10-second save interval
  153. func processFolder(folder string) {
  154. startTime = time.Now()
  155. count = 0
  156. addWork = 0
  157. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  158. count++
  159. if (count % 100) == 0 {
  160. fmt.Print(".")
  161. }
  162. if (count % 10000) == 0 {
  163. fmt.Println()
  164. }
  165. filename := info.Name()
  166. if filename[0:1] != "." {
  167. if info.IsDir() {
  168. fmt.Println(path)
  169. if prune {
  170. pruneHash(path)
  171. }
  172. }
  173. if !info.IsDir() {
  174. addWork++
  175. calculateHash(path)
  176. if time.Since(startTime).Seconds() > 10.0 {
  177. startTime = time.Now()
  178. saveAllHashFiles()
  179. addWork = 0
  180. }
  181. }
  182. }
  183. return nil
  184. })
  185. if err != nil {
  186. panic(err)
  187. }
  188. }
  189. /* delete unused hash values from the hash file */
  190. func pruneHash(dir string) {
  191. _, err := os.Stat(dir + "/.fdhashes3")
  192. if !os.IsNotExist(err) {
  193. hashFile := loadHashfile(dir + "/.fdhashes3")
  194. for filename := range hashFile.Hashes {
  195. _, err := os.Stat(dir + "/" + filename)
  196. if os.IsNotExist(err) {
  197. delete(hashFile.Hashes, filename)
  198. delete(hashFile.Times, filename)
  199. hashFile.Dirty = true
  200. }
  201. }
  202. for filename := range hashFile.Times {
  203. _, err := os.Stat(dir + "/" + filename)
  204. if os.IsNotExist(err) {
  205. delete(hashFile.Hashes, filename)
  206. delete(hashFile.Times, filename)
  207. hashFile.Dirty = true
  208. }
  209. }
  210. saveHashfile(&hashFile)
  211. }
  212. }
// dirtyHashfile marks the given hash file as modified so that the next save
// pass writes it to disk.
func dirtyHashfile(hashFile *Fdhashes) {
	hashFile.Dirty = true
}
  216. func saveAllHashFiles() {
  217. hashList := make([]Fdhashes, 0)
  218. for _, hashFile := range hashes {
  219. if hashFile.Dirty {
  220. saveHashfile(&hashFile)
  221. hashList = append(hashList, hashFile)
  222. }
  223. }
  224. hashes = make(map[string]Fdhashes)
  225. for _, hashFile := range hashList {
  226. hashes[hashFile.Path] = hashFile
  227. }
  228. }
  229. func saveHashfile(hashFile *Fdhashes) {
  230. if hashFile.Dirty {
  231. hashFile.Dirty = false
  232. b, err := json.Marshal(hashFile)
  233. if err != nil {
  234. fmt.Println(err)
  235. return
  236. }
  237. err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
  238. if err != nil {
  239. panic(err)
  240. }
  241. }
  242. }
  243. func loadHashfile(fileStr string) Fdhashes {
  244. dir, _ := filepath.Split(fileStr)
  245. dir = filepath.ToSlash(filepath.Clean(dir))
  246. data := Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: false}
  247. if !rewrite {
  248. file, err := ioutil.ReadFile(fileStr)
  249. if err != nil {
  250. panic(err)
  251. }
  252. err = json.Unmarshal([]byte(file), &data)
  253. if err != nil {
  254. log.Printf("can't read file %s", fileStr)
  255. }
  256. }
  257. if data.Path != dir {
  258. data.Path = dir
  259. data.Dirty = true
  260. }
  261. return data
  262. }
  263. func compareFolder(folder string) {
  264. loadIgnoreFile(ignores)
  265. loadAllHashFiles(folder)
  266. // putting all hashes into one big map key = hash, value list of files with that hash
  267. size := len(hashes)
  268. index := make(map[string][]string)
  269. count = 0
  270. for _, hashFile := range hashes {
  271. count++
  272. if count%100 == 0 {
  273. fmt.Printf("%d (%d) merging\n", count, size)
  274. }
  275. for filename, hash := range hashFile.Hashes {
  276. values := index[hash]
  277. if values == nil {
  278. values = make([]string, 0)
  279. }
  280. filepath := fmt.Sprintf("%s/%s", hashFile.Path, filename)
  281. pos := sort.SearchStrings(ignoreLines, filepath)
  282. if pos == len(ignoreLines) {
  283. _, err := os.Stat(filepath)
  284. if err == nil {
  285. values = append(values, filepath)
  286. index[hash] = values
  287. }
  288. }
  289. }
  290. }
  291. // sorting list of files for every hash and deleting hashes with only 1 entry
  292. size = len(index)
  293. myHashes := make([]string, 0)
  294. count = 0
  295. for hash, values := range index {
  296. count++
  297. if count%100 == 0 {
  298. fmt.Printf("%d (%d) sorting\n", count, size)
  299. }
  300. if len(values) > 1 {
  301. sort.Strings(values)
  302. index[hash] = values
  303. myHashes = append(myHashes, hash)
  304. // for _, filename := range values {
  305. // fmt.Printf(" %s\n", filename)
  306. // }
  307. } else {
  308. delete(index, hash)
  309. }
  310. }
  311. sort.Slice(myHashes, func(i, j int) bool { return index[myHashes[i]][0] < index[myHashes[j]][0] })
  312. if outputJson {
  313. size = len(index)
  314. var filesize int64
  315. fileCount := 0
  316. for _, hash := range myHashes {
  317. values := index[hash]
  318. count++
  319. if count%100 == 0 {
  320. fmt.Printf("%d (%d) checking\n", count, size)
  321. }
  322. if len(values) > 1 {
  323. info, err := os.Stat(values[0])
  324. if err == nil {
  325. fmt.Printf("found identically hash: %s size: %d\n", hash, info.Size())
  326. filesize += int64(len(values)-1) * info.Size()
  327. }
  328. fileCount += len(values) - 1
  329. for _, filename := range values {
  330. fmt.Printf(" %s\n", filename)
  331. }
  332. } else {
  333. delete(index, hash)
  334. }
  335. }
  336. b, err := json.Marshal(index)
  337. if err != nil {
  338. fmt.Println(err)
  339. return
  340. }
  341. err = ioutil.WriteFile(report, b, 0644)
  342. if err != nil {
  343. panic(err)
  344. }
  345. } else {
  346. size := len(index)
  347. f, err := os.Create(report)
  348. check(err)
  349. w := bufio.NewWriter(f)
  350. count := 0
  351. var filesize int64
  352. fileCount := 0
  353. for _, hash := range myHashes {
  354. values := index[hash]
  355. count++
  356. if count%100 == 0 {
  357. fmt.Printf("%d (%d) checking\n", count, size)
  358. }
  359. if len(values) > 1 {
  360. info, err := os.Stat(values[0])
  361. if err == nil {
  362. w.WriteString(fmt.Sprintf("found identically hash: size: %d\n", info.Size()))
  363. filesize += int64(len(values)-1) * info.Size()
  364. }
  365. fileCount += len(values) - 1
  366. for _, filename := range values {
  367. w.WriteString(fmt.Sprintf(" %s\n", filename))
  368. }
  369. w.Flush()
  370. }
  371. }
  372. w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
  373. w.Flush()
  374. }
  375. }
  376. func loadIgnoreFile(filename string) {
  377. content, err := ioutil.ReadFile(filename)
  378. if err == nil {
  379. ignoreLines = strings.Split(string(content), "\n")
  380. sort.Strings(ignoreLines)
  381. }
  382. }
// search scans all loaded hash files for entries whose hash equals srcHash
// and returns them as a semicolon-separated "path/name;" list; found reports
// whether at least one match was collected.
func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
	for _, hashFile := range hashes {
		for filename, hash := range hashFile.Hashes {
			// NOTE(review): this skips any entry that shares just the name OR
			// just the folder with the exclusion. If the intent was to skip
			// only the one exact file, the condition should be
			// !(filename == exFilename && hashFile.Path == exFilepath) — confirm.
			if (filename != exFilename) && (hashFile.Path != exFilepath) {
				if hash == srcHash {
					value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
					found = true
				}
			}
		}
	}
	return
}
  396. func loadAllHashFiles(folder string) {
  397. count = 0
  398. addWork = 0
  399. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  400. if info != nil {
  401. if info.IsDir() {
  402. count++
  403. fmt.Print(".")
  404. if (count % 100) == 0 {
  405. fmt.Println()
  406. }
  407. hashFile, ok := hashes[path]
  408. if !ok {
  409. _, err := os.Stat(path + "/.fdhashes3")
  410. if os.IsNotExist(err) {
  411. hashFile = Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  412. } else {
  413. hashFile = loadHashfile(path + "/.fdhashes3")
  414. }
  415. hashes[path] = hashFile
  416. }
  417. }
  418. }
  419. return nil
  420. })
  421. if err != nil {
  422. panic(err)
  423. }
  424. fmt.Printf("\nfound %d hash files.\n", len(hashes))
  425. }
  426. func check(e error) {
  427. if e != nil {
  428. panic(e)
  429. }
  430. }