GoHash.go

package main

import (
	"bufio"
	"crypto/sha256"
	"encoding/hex"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"log"
	"os"
	"path/filepath"
	"runtime"
	"sort"
	"sync"
	"time"

	"code.cloudfoundry.org/bytefmt"
	flag "github.com/spf13/pflag"
)

// Fdhashes holds all information about one folder: the file hashes,
// the modification times and a dirty flag.
type Fdhashes struct {
	Path   string
	Hashes map[string]string
	Times  map[string]time.Time
	Dirty  bool
}

var hashes map[string]Fdhashes
var mu sync.RWMutex
var driveLetter string
var rewrite bool
var prune bool
var outputJson bool
var report string

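// init registers the command line flags.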
func init() {
	flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fdhashes files.")
	flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and write an equality report.")
	flag.BoolVarP(&prune, "prune", "p", false, "prune unused entries from all fdhashes files.")
	flag.BoolVarP(&outputJson, "json", "j", false, "write the equality report as JSON.")
}

func main() {
	log.Println("starting GoHash")
	hashes = make(map[string]Fdhashes)
	flag.Parse()
	myFile := flag.Arg(0)
	file, err := os.Stat(myFile)
	if os.IsNotExist(err) {
		log.Fatalln("File does not exist:", myFile)
	}
	if file.IsDir() {
		log.Println("start with folder:", myFile)
		driveLetter = ""
		if runtime.GOOS == "windows" {
			driveLetter = filepath.VolumeName(myFile) + "/"
		}
		if report != "" {
			compareFolder(myFile)
		} else {
			processFolder(myFile)
			saveAllHashFiles()
		}
	} else {
		log.Printf("file %s has hash %s\n", myFile, getSha256Hash(myFile))
	}
	log.Println("done")
}

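// getSha256Hash computes the SHA-256 hash of the given file and returns it hex encoded.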
func getSha256Hash(fileStr string) string {
	f, err := os.Open(fileStr)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()
	h := sha256.New()
	if _, err := io.Copy(h, f); err != nil {
		log.Fatal(err)
	}
	return hex.EncodeToString(h.Sum(nil))
}

var lock1 = sync.RWMutex{}
var lock2 = sync.RWMutex{}

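// calculateHash hashes a single file if no up-to-date hash is stored in the
// folder's fdhashes file, and records the new hash and modification time.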
func calculateHash(fileStr string) {
	var hashFile Fdhashes
	doHash := true
	dir, fileName := filepath.Split(fileStr)
	if fileName == ".fdhashes3" {
		return
	}
	// checking if a hash is already present
	mu.Lock()
	hashFile, ok := hashes[dir]
	if !ok {
		_, err := os.Stat(dir + ".fdhashes3")
		if os.IsNotExist(err) {
			hashFile = Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
		} else {
			hashFile = loadHashfile(dir + ".fdhashes3")
		}
		hashes[dir] = hashFile
	}
	lock1.RLock()
	_, ok = hashFile.Hashes[fileName]
	lock1.RUnlock()
	mu.Unlock()
	doHash = !ok
	// checking if the stored modification time is identical
	file, _ := os.Stat(fileStr)
	modTime := file.ModTime()
	lock2.RLock()
	savedTime, ok := hashFile.Times[fileName]
	lock2.RUnlock()
	if !modTime.Equal(savedTime) || !ok {
		doHash = true
	}
	if doHash {
		log.Printf("starting %s\n", fileStr)
		hash := getSha256Hash(fileStr)
		log.Printf("ready %s\n", fileStr)
		mu.Lock()
		lock1.Lock()
		hashFile.Hashes[fileName] = hash
		lock1.Unlock()
		lock2.Lock()
		hashFile.Times[fileName] = modTime
		lock2.Unlock()
		dirtyHashfile(&hashFile)
		hashes[dir] = hashFile
		mu.Unlock()
		log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
	}
}

var count int
var addWork int
var startTime time.Time

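// processFolder walks the folder tree, hashes every regular file and
// periodically flushes the collected hash files to disk.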
func processFolder(folder string) {
	startTime = time.Now()
	count = 0
	addWork = 0
	err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
		if info == nil {
			// entry could not be read; hand the error back to Walk instead of dereferencing nil
			return err
		}
		count++
		if (count % 100) == 0 {
			fmt.Print(".")
		}
		if (count % 10000) == 0 {
			fmt.Println()
		}
		filename := info.Name()
		if filename[0:1] != "." {
			if info.IsDir() {
				fmt.Println(path)
				if prune {
					pruneHash(path)
				}
			}
			if !info.IsDir() {
				addWork++
				calculateHash(path)
				if time.Since(startTime).Seconds() > 10.0 {
					startTime = time.Now()
					saveAllHashFiles()
					addWork = 0
				}
			}
		}
		return nil
	})
	if err != nil {
		panic(err)
	}
}

// pruneHash deletes unused hash values from the hash file.
func pruneHash(dir string) {
	_, err := os.Stat(dir + "/.fdhashes3")
	if !os.IsNotExist(err) {
		hashFile := loadHashfile(dir + "/.fdhashes3")
		for filename := range hashFile.Hashes {
			_, err := os.Stat(dir + "/" + filename)
			if os.IsNotExist(err) {
				delete(hashFile.Hashes, filename)
				delete(hashFile.Times, filename)
				hashFile.Dirty = true
			}
		}
		for filename := range hashFile.Times {
			_, err := os.Stat(dir + "/" + filename)
			if os.IsNotExist(err) {
				delete(hashFile.Hashes, filename)
				delete(hashFile.Times, filename)
				hashFile.Dirty = true
			}
		}
		saveHashfile(&hashFile)
	}
}

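// dirtyHashfile marks a hash file as changed so that it will be written on the next save.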
func dirtyHashfile(hashFile *Fdhashes) {
	hashFile.Dirty = true
}

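// saveAllHashFiles writes all dirty hash files to disk and keeps only those
// entries in the in-memory cache.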
func saveAllHashFiles() {
	hashList := make([]Fdhashes, 0)
	for _, hashFile := range hashes {
		if hashFile.Dirty {
			saveHashfile(&hashFile)
			hashList = append(hashList, hashFile)
		}
	}
	hashes = make(map[string]Fdhashes)
	for _, hashFile := range hashList {
		hashes[hashFile.Path] = hashFile
	}
}

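// saveHashfile serialises a dirty hash file to JSON and writes it as .fdhashes3 into its folder.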
func saveHashfile(hashFile *Fdhashes) {
	if hashFile.Dirty {
		hashFile.Dirty = false
		b, err := json.Marshal(hashFile)
		if err != nil {
			fmt.Println(err)
			return
		}
		err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
		if err != nil {
			panic(err)
		}
	}
}

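// loadHashfile reads a .fdhashes3 file; with --rewrite set it returns an empty
// structure so that all hashes are recalculated.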
func loadHashfile(fileStr string) Fdhashes {
	dir, _ := filepath.Split(fileStr)
	dir = filepath.ToSlash(filepath.Clean(dir))
	data := Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: false}
	if !rewrite {
		file, err := ioutil.ReadFile(fileStr)
		if err != nil {
			panic(err)
		}
		err = json.Unmarshal(file, &data)
		if err != nil {
			log.Printf("can't read file %s", fileStr)
		}
	}
	if data.Path != dir {
		data.Path = dir
		data.Dirty = true
	}
	return data
}

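// compareFolder loads all hash files below the folder, groups files by hash
// and writes a report of the duplicates to the file given with --equals.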
func compareFolder(folder string) {
	loadAllHashFiles(folder)
	// put all hashes into one big map: key = hash, value = list of files with that hash
	size := len(hashes)
	index := make(map[string][]string)
	count = 0
	for _, hashFile := range hashes {
		count++
		if count%100 == 0 {
			fmt.Printf("%d (%d) merging\n", count, size)
		}
		for filename, hash := range hashFile.Hashes {
			values := index[hash]
			if values == nil {
				values = make([]string, 0)
			}
			filePath := fmt.Sprintf("%s/%s", hashFile.Path, filename)
			_, err := os.Stat(filePath)
			if err == nil {
				values = append(values, filePath)
				index[hash] = values
			}
		}
	}
	// sort the list of files for every hash and delete hashes with only one entry
	size = len(index)
	myHashes := make([]string, 0)
	count = 0
	for hash, values := range index {
		count++
		if count%100 == 0 {
			fmt.Printf("%d (%d) sorting\n", count, size)
		}
		if len(values) > 1 {
			sort.Strings(values)
			index[hash] = values
			myHashes = append(myHashes, hash)
			// for _, filename := range values {
			// 	fmt.Printf(" %s\n", filename)
			// }
		} else {
			delete(index, hash)
		}
	}
	sort.Slice(myHashes, func(i, j int) bool { return index[myHashes[i]][0] < index[myHashes[j]][0] })
	if outputJson {
		size = len(index)
		var filesize int64
		fileCount := 0
		for _, hash := range myHashes {
			values := index[hash]
			count++
			if count%100 == 0 {
				fmt.Printf("%d (%d) checking\n", count, size)
			}
			if len(values) > 1 {
				info, err := os.Stat(values[0])
				if err == nil {
					fmt.Printf("found identical hash: %s size: %d\n", hash, info.Size())
					filesize += int64(len(values)-1) * info.Size()
				}
				fileCount += len(values) - 1
				for _, filename := range values {
					fmt.Printf(" %s\n", filename)
				}
			} else {
				delete(index, hash)
			}
		}
		b, err := json.Marshal(index)
		if err != nil {
			fmt.Println(err)
			return
		}
		err = ioutil.WriteFile(report, b, 0644)
		if err != nil {
			panic(err)
		}
	} else {
		size := len(index)
		f, err := os.Create(report)
		check(err)
		w := bufio.NewWriter(f)
		count := 0
		var filesize int64
		fileCount := 0
		for _, hash := range myHashes {
			values := index[hash]
			count++
			if count%100 == 0 {
				fmt.Printf("%d (%d) checking\n", count, size)
			}
			if len(values) > 1 {
				info, err := os.Stat(values[0])
				if err == nil {
					w.WriteString(fmt.Sprintf("found identical hash: size: %d\n", info.Size()))
					filesize += int64(len(values)-1) * info.Size()
				}
				fileCount += len(values) - 1
				for _, filename := range values {
					w.WriteString(fmt.Sprintf(" %s\n", filename))
				}
				w.Flush()
			}
		}
		w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
		w.Flush()
	}
}

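// search returns all files that carry the given hash, excluding entries that
// share the given file name or folder, as a semicolon separated list.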
func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
	for _, hashFile := range hashes {
		for filename, hash := range hashFile.Hashes {
			if (filename != exFilename) && (hashFile.Path != exFilepath) {
				if hash == srcHash {
					value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
					found = true
				}
			}
		}
	}
	return
}

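// loadAllHashFiles walks the folder tree and loads (or creates) the hash file
// of every directory into the global hashes map.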
func loadAllHashFiles(folder string) {
	count = 0
	addWork = 0
	err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
		if info != nil {
			if info.IsDir() {
				count++
				fmt.Print(".")
				if (count % 100) == 0 {
					fmt.Println()
				}
				hashFile, ok := hashes[path]
				if !ok {
					_, err := os.Stat(path + "/.fdhashes3")
					if os.IsNotExist(err) {
						hashFile = Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
					} else {
						hashFile = loadHashfile(path + "/.fdhashes3")
					}
					hashes[path] = hashFile
				}
			}
		}
		return nil
	})
	if err != nil {
		panic(err)
	}
	fmt.Printf("\nfound %d hash files.\n", len(hashes))
}

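// check panics on any non-nil error.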
func check(e error) {
	if e != nil {
		panic(e)
	}
}