main.go 9.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416
  1. package main
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "fmt"
  6. "io/ioutil"
  7. "log"
  8. "os"
  9. "path/filepath"
  10. "sort"
  11. "strings"
  12. "sync"
  13. "time"
  14. "code.cloudfoundry.org/bytefmt"
  15. flag "github.com/spf13/pflag"
  16. "wkla.no-ip.biz/gogs/Willie/GoHash/pkg/hash"
  17. )
  18. var hashes map[string]hash.Fdhashes
  19. var ignoreLines []string
  20. var mu sync.RWMutex
  21. var rewrite bool
  22. var prune bool
  23. var outputJson bool
  24. var report string
  25. var ignores string
  26. func init() {
  27. flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fhhashes files.")
  28. flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and writing a equlatity report.")
  29. flag.BoolVarP(&prune, "prune", "p", false, "checking all fdhashes files.")
  30. flag.BoolVarP(&outputJson, "json", "j", false, "output as json.")
  31. flag.StringVarP(&ignores, "ignores", "i", "", "list of files to ignore in report.")
  32. }
  33. func main() {
  34. log.Println("starting GoHash")
  35. ignoreLines = make([]string, 0)
  36. hashes = make(map[string]hash.Fdhashes)
  37. flag.Parse()
  38. myFile := flag.Arg(0)
  39. if rewrite {
  40. log.Println("rewrite active")
  41. }
  42. if prune {
  43. log.Println("prune active")
  44. }
  45. if outputJson {
  46. log.Println("output json format active")
  47. }
  48. if report != "" {
  49. log.Println("report active, file: ", report)
  50. }
  51. if ignores != "" {
  52. log.Println("ignores file: ", ignores)
  53. }
  54. file, err := os.Stat(myFile)
  55. if os.IsNotExist(err) {
  56. log.Fatalln("File does not exists:", myFile)
  57. }
  58. if file.IsDir() {
  59. log.Println("start with folder:", myFile)
  60. if report != "" {
  61. compareFolder(myFile)
  62. } else {
  63. processFolder(myFile)
  64. saveAllHashFiles()
  65. }
  66. } else {
  67. log.Printf("file %s has hash %s\n", myFile, hash.GetFileHash(myFile))
  68. }
  69. log.Println("done")
  70. }
  71. var lock1 = sync.RWMutex{}
  72. var lock2 = sync.RWMutex{}
  73. func calculateHash(fileStr string) {
  74. var hashFile hash.Fdhashes
  75. doHash := true
  76. dir, fileName := filepath.Split(fileStr)
  77. if fileName == ".fdhashes3" {
  78. return
  79. }
  80. // checking if hash is present
  81. mu.Lock()
  82. hashFile, ok := hashes[dir]
  83. if !ok {
  84. _, err := os.Stat(dir + ".fdhashes3")
  85. if os.IsNotExist(err) || rewrite {
  86. hashFile = hash.Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  87. } else {
  88. hf, err := hash.LoadHashfile(dir + ".fdhashes3")
  89. check(err)
  90. hashFile = *hf
  91. }
  92. hashes[dir] = hashFile
  93. }
  94. lock1.RLock()
  95. _, ok = hashFile.Hashes[fileName]
  96. lock1.RUnlock()
  97. mu.Unlock()
  98. doHash = !ok
  99. // checking if dattime is identically
  100. file, _ := os.Stat(fileStr)
  101. time := file.ModTime()
  102. lock2.RLock()
  103. savedTime, ok := hashFile.Times[fileName]
  104. lock2.RUnlock()
  105. if !time.Equal(savedTime) || !ok {
  106. doHash = true
  107. }
  108. if doHash {
  109. log.Printf("starting %s\n", fileStr)
  110. hash := hash.GetFileHash(fileStr)
  111. log.Printf("ready %s\n", fileStr)
  112. mu.Lock()
  113. lock1.Lock()
  114. hashFile.Hashes[fileName] = hash
  115. lock1.Unlock()
  116. lock2.Lock()
  117. hashFile.Times[fileName] = time
  118. lock2.Unlock()
  119. dirtyHashfile(&hashFile)
  120. hashes[dir] = hashFile
  121. mu.Unlock()
  122. log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
  123. }
  124. }
  125. var count int
  126. var addWork int
  127. var startTime time.Time
  128. func processFolder(folder string) {
  129. startTime = time.Now()
  130. count = 0
  131. addWork = 0
  132. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  133. count++
  134. if (count % 100) == 0 {
  135. fmt.Print(".")
  136. }
  137. if (count % 10000) == 0 {
  138. fmt.Println()
  139. }
  140. filename := info.Name()
  141. if filename[0:1] != "." {
  142. if info.IsDir() {
  143. fmt.Println(path)
  144. if prune {
  145. pruneHash(path)
  146. }
  147. }
  148. if !info.IsDir() {
  149. addWork++
  150. calculateHash(path)
  151. if time.Since(startTime).Seconds() > 10.0 {
  152. startTime = time.Now()
  153. saveAllHashFiles()
  154. addWork = 0
  155. }
  156. }
  157. }
  158. return nil
  159. })
  160. if err != nil {
  161. panic(err)
  162. }
  163. }
  164. /* delete unused hash values from the hash file */
  165. func pruneHash(dir string) {
  166. _, err := os.Stat(dir + "/.fdhashes3")
  167. if !os.IsNotExist(err) {
  168. hashFile, err := hash.LoadHashfile(dir + "/.fdhashes3")
  169. check(err)
  170. for filename := range hashFile.Hashes {
  171. _, err := os.Stat(dir + "/" + filename)
  172. if os.IsNotExist(err) {
  173. delete(hashFile.Hashes, filename)
  174. delete(hashFile.Times, filename)
  175. hashFile.Dirty = true
  176. }
  177. }
  178. for filename := range hashFile.Times {
  179. _, err := os.Stat(dir + "/" + filename)
  180. if os.IsNotExist(err) {
  181. delete(hashFile.Hashes, filename)
  182. delete(hashFile.Times, filename)
  183. hashFile.Dirty = true
  184. }
  185. }
  186. saveHashfile(hashFile)
  187. }
  188. }
  189. func dirtyHashfile(hashFile *hash.Fdhashes) {
  190. hashFile.Dirty = true
  191. }
  192. func saveAllHashFiles() {
  193. hashList := make([]hash.Fdhashes, 0)
  194. for _, hashFile := range hashes {
  195. if hashFile.Dirty {
  196. saveHashfile(&hashFile)
  197. hashList = append(hashList, hashFile)
  198. }
  199. }
  200. hashes = make(map[string]hash.Fdhashes)
  201. for _, hashFile := range hashList {
  202. hashes[hashFile.Path] = hashFile
  203. }
  204. }
  205. func saveHashfile(hashFile *hash.Fdhashes) {
  206. if hashFile.Dirty {
  207. hashFile.Dirty = false
  208. b, err := json.Marshal(hashFile)
  209. if err != nil {
  210. fmt.Println(err)
  211. return
  212. }
  213. err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
  214. if err != nil {
  215. panic(err)
  216. }
  217. }
  218. }
  219. func compareFolder(folder string) {
  220. loadIgnoreFile(ignores)
  221. loadAllHashFiles(folder)
  222. // putting all hashes into one big map key = hash, value list of files with that hash
  223. size := len(hashes)
  224. index := make(map[string][]string)
  225. count = 0
  226. for _, hashFile := range hashes {
  227. count++
  228. if count%100 == 0 {
  229. fmt.Printf("%d (%d) merging\n", count, size)
  230. }
  231. for filename, hash := range hashFile.Hashes {
  232. values := index[hash]
  233. if values == nil {
  234. values = make([]string, 0)
  235. }
  236. filepath := fmt.Sprintf("%s/%s", hashFile.Path, filename)
  237. pos := sort.SearchStrings(ignoreLines, filepath)
  238. if pos == len(ignoreLines) {
  239. _, err := os.Stat(filepath)
  240. if err == nil {
  241. values = append(values, filepath)
  242. index[hash] = values
  243. }
  244. }
  245. }
  246. }
  247. // sorting list of files for every hash and deleting hashes with only 1 entry
  248. size = len(index)
  249. myHashes := make([]string, 0)
  250. count = 0
  251. for hash, values := range index {
  252. count++
  253. if count%100 == 0 {
  254. fmt.Printf("%d (%d) sorting\n", count, size)
  255. }
  256. if len(values) > 1 {
  257. sort.Strings(values)
  258. index[hash] = values
  259. myHashes = append(myHashes, hash)
  260. // for _, filename := range values {
  261. // fmt.Printf(" %s\n", filename)
  262. // }
  263. } else {
  264. delete(index, hash)
  265. }
  266. }
  267. sort.Slice(myHashes, func(i, j int) bool { return index[myHashes[i]][0] < index[myHashes[j]][0] })
  268. if outputJson {
  269. size = len(index)
  270. var filesize int64
  271. fileCount := 0
  272. for _, hash := range myHashes {
  273. values := index[hash]
  274. count++
  275. if count%100 == 0 {
  276. fmt.Printf("%d (%d) checking\n", count, size)
  277. }
  278. if len(values) > 1 {
  279. info, err := os.Stat(values[0])
  280. if err == nil {
  281. fmt.Printf("found identically hash: %s size: %d\n", hash, info.Size())
  282. filesize += int64(len(values)-1) * info.Size()
  283. }
  284. fileCount += len(values) - 1
  285. for _, filename := range values {
  286. fmt.Printf(" %s\n", filename)
  287. }
  288. } else {
  289. delete(index, hash)
  290. }
  291. }
  292. b, err := json.Marshal(index)
  293. if err != nil {
  294. fmt.Println(err)
  295. return
  296. }
  297. err = ioutil.WriteFile(report, b, 0644)
  298. if err != nil {
  299. panic(err)
  300. }
  301. } else {
  302. size := len(index)
  303. f, err := os.Create(report)
  304. check(err)
  305. w := bufio.NewWriter(f)
  306. count := 0
  307. var filesize int64
  308. fileCount := 0
  309. for _, hash := range myHashes {
  310. values := index[hash]
  311. count++
  312. if count%100 == 0 {
  313. fmt.Printf("%d (%d) checking\n", count, size)
  314. }
  315. if len(values) > 1 {
  316. info, err := os.Stat(values[0])
  317. if err == nil {
  318. w.WriteString(fmt.Sprintf("found identically hash: size: %d\n", info.Size()))
  319. filesize += int64(len(values)-1) * info.Size()
  320. }
  321. fileCount += len(values) - 1
  322. for _, filename := range values {
  323. w.WriteString(fmt.Sprintf(" %s\n", filename))
  324. }
  325. w.Flush()
  326. }
  327. }
  328. w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
  329. w.Flush()
  330. }
  331. }
  332. func loadIgnoreFile(filename string) {
  333. content, err := ioutil.ReadFile(filename)
  334. if err == nil {
  335. ignoreLines = strings.Split(string(content), "\n")
  336. sort.Strings(ignoreLines)
  337. }
  338. }
  339. func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
  340. for _, hashFile := range hashes {
  341. for filename, hash := range hashFile.Hashes {
  342. if (filename != exFilename) && (hashFile.Path != exFilepath) {
  343. if hash == srcHash {
  344. value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
  345. found = true
  346. }
  347. }
  348. }
  349. }
  350. return
  351. }
  352. func loadAllHashFiles(folder string) {
  353. count = 0
  354. addWork = 0
  355. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  356. if info != nil {
  357. if info.IsDir() {
  358. count++
  359. fmt.Print(".")
  360. if (count % 100) == 0 {
  361. fmt.Println()
  362. }
  363. hashFile, ok := hashes[path]
  364. if !ok {
  365. _, err := os.Stat(path + "/.fdhashes3")
  366. if os.IsNotExist(err) {
  367. hashFile = hash.Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  368. } else {
  369. hf, err := hash.LoadHashfile(path + "/.fdhashes3")
  370. check(err)
  371. hashFile = *hf
  372. }
  373. hashes[path] = hashFile
  374. }
  375. }
  376. }
  377. return nil
  378. })
  379. check(err)
  380. fmt.Printf("\nfound %d hash files.\n", len(hashes))
  381. }
  382. func check(e error) {
  383. if e != nil {
  384. panic(e)
  385. }
  386. }