GoHash.go 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399
  1. package main
  2. import (
  3. "bufio"
  4. "crypto/sha256"
  5. "encoding/hex"
  6. "encoding/json"
  7. "fmt"
  8. "io"
  9. "io/ioutil"
  10. "log"
  11. "os"
  12. "path/filepath"
  13. "runtime"
  14. "sync"
  15. "time"
  16. "code.cloudfoundry.org/bytefmt"
  17. flag "github.com/spf13/pflag"
  18. )
  // Fdhashes is the in-memory form of one folder's ".fdhashes3" file.
  // The record is written with encoding/json, so the exported field names
  // are also the keys of the stored JSON document.
  type Fdhashes struct {
  	// Path is the folder the record belongs to.
  	Path string
  	// Hashes maps a file name to its hex-encoded SHA-256 digest.
  	Hashes map[string]string
  	// Times maps a file name to the modification time it had when hashed.
  	Times map[string]time.Time
  	// Dirty is true while the record has changes that are not saved yet.
  	Dirty bool
  }
  26. var hashes map[string]Fdhashes
  27. var mu sync.RWMutex
  28. var driveLetter string
  29. var rewrite bool
  30. var prune bool
  31. var outputJson bool
  32. var report string
  33. func init() {
  34. flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fhhashes files.")
  35. flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and writing a equlatity report.")
  36. flag.BoolVarP(&prune, "prune", "p", false, "checking all fdhashes files.")
  37. flag.BoolVarP(&outputJson, "json", "j", false, "output as json.")
  38. }
  39. func main() {
  40. log.Println("starting GoHash")
  41. hashes = make(map[string]Fdhashes)
  42. flag.Parse()
  43. myFile := flag.Arg(0)
  44. file, err := os.Stat(myFile)
  45. if os.IsNotExist(err) {
  46. log.Fatalln("File does not exists:", myFile)
  47. }
  48. if file.IsDir() {
  49. log.Println("start with folder:", myFile)
  50. driveLetter = ""
  51. if runtime.GOOS == "windows" {
  52. driveLetter = filepath.VolumeName(myFile) + "/"
  53. }
  54. if report != "" {
  55. compareFolder(myFile)
  56. } else {
  57. processFolder(myFile)
  58. saveAllHashFiles()
  59. }
  60. } else {
  61. log.Printf("file %s has hash %s\n", myFile, getSha256Hash(myFile))
  62. }
  63. log.Println("done")
  64. }
  // getSha256Hash streams the file named fileStr through SHA-256 and returns
  // the digest as a lowercase hex string. If the file cannot be opened or
  // read, the program is terminated through log.Fatal.
  func getSha256Hash(fileStr string) string {
  	src, openErr := os.Open(fileStr)
  	if openErr != nil {
  		log.Fatal(openErr)
  	}
  	defer src.Close()

  	digest := sha256.New()
  	_, copyErr := io.Copy(digest, src)
  	if copyErr != nil {
  		log.Fatal(copyErr)
  	}
  	sum := digest.Sum(nil)
  	return hex.EncodeToString(sum)
  }
  // lock1 and lock2 are taken by calculateHash around its accesses to a
  // record's Hashes map (lock1) and Times map (lock2).
  var (
  	lock1 sync.RWMutex
  	lock2 sync.RWMutex
  )
  79. func calculateHash(fileStr string) {
  80. var hashFile Fdhashes
  81. doHash := true
  82. dir, fileName := filepath.Split(fileStr)
  83. if fileName == ".fdhashes3" {
  84. return
  85. }
  86. // checking if hash is present
  87. mu.Lock()
  88. hashFile, ok := hashes[dir]
  89. if !ok {
  90. _, err := os.Stat(dir + ".fdhashes3")
  91. if os.IsNotExist(err) {
  92. hashFile = Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  93. } else {
  94. hashFile = loadHashfile(dir + ".fdhashes3")
  95. }
  96. hashes[dir] = hashFile
  97. }
  98. lock1.RLock()
  99. _, ok = hashFile.Hashes[fileName]
  100. lock1.RUnlock()
  101. mu.Unlock()
  102. doHash = !ok
  103. // checking if dattime is identically
  104. file, _ := os.Stat(fileStr)
  105. time := file.ModTime()
  106. lock2.RLock()
  107. savedTime, ok := hashFile.Times[fileName]
  108. lock2.RUnlock()
  109. if !time.Equal(savedTime) || !ok {
  110. doHash = true
  111. }
  112. if doHash {
  113. log.Printf("starting %s\n", fileStr)
  114. hash := getSha256Hash(fileStr)
  115. log.Printf("ready %s\n", fileStr)
  116. mu.Lock()
  117. lock1.Lock()
  118. hashFile.Hashes[fileName] = hash
  119. lock1.Unlock()
  120. lock2.Lock()
  121. hashFile.Times[fileName] = time
  122. lock2.Unlock()
  123. dirtyHashfile(&hashFile)
  124. hashes[dir] = hashFile
  125. mu.Unlock()
  126. log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
  127. }
  128. }
  // Progress bookkeeping shared by the folder walks.
  var (
  	// count is the running counter behind the progress output.
  	count int
  	// addWork counts the files processFolder handed to calculateHash since
  	// the last periodic save.
  	addWork int
  	// startTime is when processFolder started or last saved the hash files.
  	startTime time.Time
  )
  132. func processFolder(folder string) {
  133. startTime = time.Now()
  134. count = 0
  135. addWork = 0
  136. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  137. count++
  138. if (count % 100) == 0 {
  139. fmt.Print(".")
  140. }
  141. if (count % 10000) == 0 {
  142. fmt.Println()
  143. }
  144. filename := info.Name()
  145. if filename[0:1] != "." {
  146. if info.IsDir() {
  147. fmt.Println(path)
  148. if prune {
  149. pruneHash(path)
  150. }
  151. }
  152. if !info.IsDir() {
  153. addWork++
  154. calculateHash(path)
  155. if time.Since(startTime).Seconds() > 10.0 {
  156. startTime = time.Now()
  157. saveAllHashFiles()
  158. addWork = 0
  159. }
  160. }
  161. }
  162. return nil
  163. })
  164. if err != nil {
  165. panic(err)
  166. }
  167. }
  168. /* delete unused hash values from the hash file */
  169. func pruneHash(dir string) {
  170. _, err := os.Stat(dir + "/.fdhashes3")
  171. if !os.IsNotExist(err) {
  172. hashFile := loadHashfile(dir + "/.fdhashes3")
  173. for filename := range hashFile.Hashes {
  174. _, err := os.Stat(dir + "/" + filename)
  175. if os.IsNotExist(err) {
  176. delete(hashFile.Hashes, filename)
  177. delete(hashFile.Times, filename)
  178. hashFile.Dirty = true
  179. }
  180. }
  181. for filename := range hashFile.Times {
  182. _, err := os.Stat(dir + "/" + filename)
  183. if os.IsNotExist(err) {
  184. delete(hashFile.Hashes, filename)
  185. delete(hashFile.Times, filename)
  186. hashFile.Dirty = true
  187. }
  188. }
  189. saveHashfile(&hashFile)
  190. }
  191. }
  192. func dirtyHashfile(hashFile *Fdhashes) {
  193. hashFile.Dirty = true
  194. }
  195. func saveAllHashFiles() {
  196. hashList := make([]Fdhashes, 0)
  197. for _, hashFile := range hashes {
  198. if hashFile.Dirty {
  199. saveHashfile(&hashFile)
  200. hashList = append(hashList, hashFile)
  201. }
  202. }
  203. hashes = make(map[string]Fdhashes)
  204. for _, hashFile := range hashList {
  205. hashes[hashFile.Path] = hashFile
  206. }
  207. }
  208. func saveHashfile(hashFile *Fdhashes) {
  209. if hashFile.Dirty {
  210. hashFile.Dirty = false
  211. b, err := json.Marshal(hashFile)
  212. if err != nil {
  213. fmt.Println(err)
  214. return
  215. }
  216. err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
  217. if err != nil {
  218. panic(err)
  219. }
  220. }
  221. }
  222. func loadHashfile(fileStr string) Fdhashes {
  223. dir, _ := filepath.Split(fileStr)
  224. dir = filepath.ToSlash(filepath.Clean(dir))
  225. data := Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: false}
  226. if !rewrite {
  227. file, err := ioutil.ReadFile(fileStr)
  228. if err != nil {
  229. panic(err)
  230. }
  231. err = json.Unmarshal([]byte(file), &data)
  232. if err != nil {
  233. log.Printf("can't read file %s", fileStr)
  234. }
  235. }
  236. if data.Path != dir {
  237. data.Path = dir
  238. data.Dirty = true
  239. }
  240. return data
  241. }
  242. func compareFolder(folder string) {
  243. loadAllHashFiles(folder)
  244. index := make(map[string][]string)
  245. for _, hashFile := range hashes {
  246. for filename, hash := range hashFile.Hashes {
  247. values := index[hash]
  248. if values == nil {
  249. values = make([]string, 0)
  250. }
  251. values = append(values, fmt.Sprintf("%s/%s", hashFile.Path, filename))
  252. index[hash] = values
  253. }
  254. }
  255. if outputJson {
  256. size := len(index)
  257. var filesize int64
  258. fileCount := 0
  259. for hash, values := range index {
  260. count++
  261. if count%100 == 0 {
  262. fmt.Printf("%d (%d) checking\n", count, size)
  263. }
  264. if len(values) > 1 {
  265. info, err := os.Stat(values[0])
  266. if err == nil {
  267. fmt.Printf("found identically hash: %s size: %d\n", hash, info.Size())
  268. filesize += int64(len(values)-1) * info.Size()
  269. }
  270. fileCount += len(values) - 1
  271. for _, filename := range values {
  272. fmt.Printf(" %s\n", filename)
  273. }
  274. } else {
  275. delete(index, hash)
  276. }
  277. }
  278. b, err := json.Marshal(index)
  279. if err != nil {
  280. fmt.Println(err)
  281. return
  282. }
  283. err = ioutil.WriteFile(report, b, 0644)
  284. if err != nil {
  285. panic(err)
  286. }
  287. } else {
  288. size := len(index)
  289. f, err := os.Create(report)
  290. check(err)
  291. w := bufio.NewWriter(f)
  292. count := 0
  293. var filesize int64
  294. fileCount := 0
  295. for _, values := range index {
  296. count++
  297. if count%100 == 0 {
  298. fmt.Printf("%d (%d) checking\n", count, size)
  299. }
  300. if len(values) > 1 {
  301. info, err := os.Stat(values[0])
  302. if err == nil {
  303. w.WriteString(fmt.Sprintf("found identically hash: size: %d\n", info.Size()))
  304. filesize += int64(len(values)-1) * info.Size()
  305. }
  306. fileCount += len(values) - 1
  307. for _, filename := range values {
  308. w.WriteString(fmt.Sprintf(" %s\n", filename))
  309. }
  310. w.Flush()
  311. }
  312. }
  313. w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
  314. w.Flush()
  315. }
  316. }
  317. func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
  318. for _, hashFile := range hashes {
  319. for filename, hash := range hashFile.Hashes {
  320. if (filename != exFilename) && (hashFile.Path != exFilepath) {
  321. if hash == srcHash {
  322. value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
  323. found = true
  324. }
  325. }
  326. }
  327. }
  328. return
  329. }
  330. func loadAllHashFiles(folder string) {
  331. count = 0
  332. addWork = 0
  333. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  334. if info != nil {
  335. if info.IsDir() {
  336. count++
  337. fmt.Print(".")
  338. if (count % 100) == 0 {
  339. fmt.Println()
  340. }
  341. hashFile, ok := hashes[path]
  342. if !ok {
  343. _, err := os.Stat(path + "/.fdhashes3")
  344. if os.IsNotExist(err) {
  345. hashFile = Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  346. } else {
  347. hashFile = loadHashfile(path + "/.fdhashes3")
  348. }
  349. hashes[path] = hashFile
  350. }
  351. }
  352. }
  353. return nil
  354. })
  355. if err != nil {
  356. panic(err)
  357. }
  358. fmt.Printf("\nfound %d hash files.\n", len(hashes))
  359. }
  // check panics with e when e is not nil and does nothing otherwise.
  func check(e error) {
  	if e == nil {
  		return
  	}
  	panic(e)
  }