main.go 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. package main
  2. import (
  3. "bufio"
  4. "encoding/json"
  5. "fmt"
  6. "io/ioutil"
  7. "log"
  8. "os"
  9. "path/filepath"
  10. "slices"
  11. "sort"
  12. "strings"
  13. "sync"
  14. "time"
  15. "code.cloudfoundry.org/bytefmt"
  16. flag "github.com/spf13/pflag"
  17. "wkla.no-ip.biz/gogs/Willie/GoHash/pkg/hash"
  18. )
  19. var hashes map[string]hash.Fdhashes
  20. var ignoreLines []string
  21. var mu sync.RWMutex
  22. var rewrite bool
  23. var prune bool
  24. var outputJson bool
  25. var report string
  26. var ignores string
  27. func init() {
  28. flag.BoolVarP(&rewrite, "rewrite", "r", false, "rewrite all fhhashes files.")
  29. flag.StringVarP(&report, "equals", "e", "", "compare all file hashes and writing a equlatity report.")
  30. flag.BoolVarP(&prune, "prune", "p", false, "checking all fdhashes files.")
  31. flag.BoolVarP(&outputJson, "json", "j", false, "output as json.")
  32. flag.StringVarP(&ignores, "ignores", "i", "", "list of files to ignore in report.")
  33. }
  34. func main() {
  35. log.Println("starting GoHash")
  36. ignoreLines = make([]string, 0)
  37. hashes = make(map[string]hash.Fdhashes)
  38. flag.Parse()
  39. myFile := flag.Arg(0)
  40. if rewrite {
  41. log.Println("rewrite active")
  42. }
  43. if prune {
  44. log.Println("prune active")
  45. }
  46. if outputJson {
  47. log.Println("output json format active")
  48. }
  49. if report != "" {
  50. log.Println("report active, file: ", report)
  51. }
  52. if ignores != "" {
  53. log.Println("ignores file: ", ignores)
  54. }
  55. file, err := os.Stat(myFile)
  56. if os.IsNotExist(err) {
  57. log.Fatalln("File does not exists:", myFile)
  58. }
  59. if file.IsDir() {
  60. log.Println("start with folder:", myFile)
  61. if report != "" {
  62. compareFolder(myFile)
  63. } else {
  64. processFolder(myFile)
  65. saveAllHashFiles()
  66. }
  67. } else {
  68. log.Printf("file %s has hash %s\n", myFile, hash.GetFileHash(myFile))
  69. }
  70. log.Println("done")
  71. }
  72. var lock1 = sync.RWMutex{}
  73. var lock2 = sync.RWMutex{}
  74. func calculateHash(fileStr string) {
  75. var hashFile hash.Fdhashes
  76. doHash := true
  77. dir, fileName := filepath.Split(fileStr)
  78. if fileName == ".fdhashes3" {
  79. return
  80. }
  81. // checking if hash is present
  82. mu.Lock()
  83. hashFile, ok := hashes[dir]
  84. if !ok {
  85. _, err := os.Stat(dir + ".fdhashes3")
  86. if os.IsNotExist(err) || rewrite {
  87. hashFile = hash.Fdhashes{Path: dir, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  88. } else {
  89. hf, err := hash.LoadHashfile(dir + ".fdhashes3")
  90. check(err)
  91. hashFile = *hf
  92. }
  93. hashes[dir] = hashFile
  94. }
  95. lock1.RLock()
  96. _, ok = hashFile.Hashes[fileName]
  97. lock1.RUnlock()
  98. mu.Unlock()
  99. doHash = !ok
  100. // checking if dattime is identically
  101. file, _ := os.Stat(fileStr)
  102. time := file.ModTime()
  103. lock2.RLock()
  104. savedTime, ok := hashFile.Times[fileName]
  105. lock2.RUnlock()
  106. if !time.Equal(savedTime) || !ok {
  107. doHash = true
  108. }
  109. if doHash {
  110. log.Printf("starting %s\n", fileStr)
  111. hash := hash.GetFileHash(fileStr)
  112. log.Printf("ready %s\n", fileStr)
  113. mu.Lock()
  114. lock1.Lock()
  115. hashFile.Hashes[fileName] = hash
  116. lock1.Unlock()
  117. lock2.Lock()
  118. hashFile.Times[fileName] = time
  119. lock2.Unlock()
  120. dirtyHashfile(&hashFile)
  121. hashes[dir] = hashFile
  122. mu.Unlock()
  123. log.Printf("file \"%s\" has hash \"%s\"\n", fileStr, hash)
  124. }
  125. }
  126. var count int
  127. var addWork int
  128. var startTime time.Time
  129. func processFolder(folder string) {
  130. startTime = time.Now()
  131. count = 0
  132. addWork = 0
  133. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  134. count++
  135. if (count % 100) == 0 {
  136. fmt.Print(".")
  137. }
  138. if (count % 10000) == 0 {
  139. fmt.Println()
  140. }
  141. filename := info.Name()
  142. if filename[0:1] != "." {
  143. if info.IsDir() {
  144. fmt.Println(path)
  145. if prune {
  146. pruneHash(path)
  147. }
  148. }
  149. if !info.IsDir() {
  150. addWork++
  151. calculateHash(path)
  152. if time.Since(startTime).Seconds() > 10.0 {
  153. startTime = time.Now()
  154. saveAllHashFiles()
  155. addWork = 0
  156. }
  157. }
  158. }
  159. return nil
  160. })
  161. if err != nil {
  162. panic(err)
  163. }
  164. }
  165. /* delete unused hash values from the hash file */
  166. func pruneHash(dir string) {
  167. _, err := os.Stat(dir + "/.fdhashes3")
  168. if !os.IsNotExist(err) {
  169. hashFile, err := hash.LoadHashfile(dir + "/.fdhashes3")
  170. check(err)
  171. for filename := range hashFile.Hashes {
  172. _, err := os.Stat(dir + "/" + filename)
  173. if os.IsNotExist(err) {
  174. delete(hashFile.Hashes, filename)
  175. delete(hashFile.Times, filename)
  176. hashFile.Dirty = true
  177. }
  178. }
  179. for filename := range hashFile.Times {
  180. _, err := os.Stat(dir + "/" + filename)
  181. if os.IsNotExist(err) {
  182. delete(hashFile.Hashes, filename)
  183. delete(hashFile.Times, filename)
  184. hashFile.Dirty = true
  185. }
  186. }
  187. saveHashfile(hashFile)
  188. }
  189. }
  190. func dirtyHashfile(hashFile *hash.Fdhashes) {
  191. hashFile.Dirty = true
  192. }
  193. func saveAllHashFiles() {
  194. hashList := make([]hash.Fdhashes, 0)
  195. for _, hashFile := range hashes {
  196. if hashFile.Dirty {
  197. saveHashfile(&hashFile)
  198. hashList = append(hashList, hashFile)
  199. }
  200. }
  201. hashes = make(map[string]hash.Fdhashes)
  202. for _, hashFile := range hashList {
  203. hashes[hashFile.Path] = hashFile
  204. }
  205. }
  206. func saveHashfile(hashFile *hash.Fdhashes) {
  207. if hashFile.Dirty {
  208. hashFile.Dirty = false
  209. b, err := json.Marshal(hashFile)
  210. if err != nil {
  211. fmt.Println(err)
  212. return
  213. }
  214. err = ioutil.WriteFile(hashFile.Path+"/.fdhashes3", b, 0644)
  215. if err != nil {
  216. panic(err)
  217. }
  218. }
  219. }
  220. func compareFolder(folder string) {
  221. loadIgnoreFile(ignores)
  222. loadAllHashFiles(folder)
  223. // putting all hashes into one big map key = hash, value list of files with that hash
  224. size := len(hashes)
  225. index := make(map[string][]string)
  226. count = 0
  227. for _, hashFile := range hashes {
  228. count++
  229. if count%100 == 0 {
  230. fmt.Printf("%d (%d) merging\n", count, size)
  231. }
  232. for filename, hash := range hashFile.Hashes {
  233. values := index[hash]
  234. if values == nil {
  235. values = make([]string, 0)
  236. }
  237. filepath := fmt.Sprintf("%s/%s", hashFile.Path, filename)
  238. if !contains(filepath) {
  239. _, err := os.Stat(filepath)
  240. if err == nil {
  241. values = append(values, filepath)
  242. index[hash] = values
  243. }
  244. }
  245. }
  246. }
  247. // sorting list of files for every hash and deleting hashes with only 1 entry
  248. size = len(index)
  249. myHashes := make([]string, 0)
  250. count = 0
  251. for hash, values := range index {
  252. count++
  253. if count%100 == 0 {
  254. fmt.Printf("%d (%d) sorting\n", count, size)
  255. }
  256. if len(values) > 1 {
  257. sort.Strings(values)
  258. index[hash] = values
  259. myHashes = append(myHashes, hash)
  260. // for _, filename := range values {
  261. // fmt.Printf(" %s\n", filename)
  262. // }
  263. } else {
  264. delete(index, hash)
  265. }
  266. }
  267. sort.Slice(myHashes, func(i, j int) bool { return index[myHashes[i]][0] < index[myHashes[j]][0] })
  268. if outputJson {
  269. size = len(index)
  270. var filesize int64
  271. fileCount := 0
  272. for _, hash := range myHashes {
  273. values := index[hash]
  274. count++
  275. if count%100 == 0 {
  276. fmt.Printf("%d (%d) checking\n", count, size)
  277. }
  278. if len(values) > 1 {
  279. info, err := os.Stat(values[0])
  280. if err == nil {
  281. fmt.Printf("found identically hash: %s size: %d\n", hash, info.Size())
  282. filesize += int64(len(values)-1) * info.Size()
  283. }
  284. fileCount += len(values) - 1
  285. for _, filename := range values {
  286. fmt.Printf(" %s\n", filename)
  287. }
  288. } else {
  289. delete(index, hash)
  290. }
  291. }
  292. b, err := json.Marshal(index)
  293. if err != nil {
  294. fmt.Println(err)
  295. return
  296. }
  297. err = os.WriteFile(report, b, 0644)
  298. if err != nil {
  299. panic(err)
  300. }
  301. } else {
  302. size := len(index)
  303. f, err := os.Create(report)
  304. check(err)
  305. w := bufio.NewWriter(f)
  306. count := 0
  307. var filesize int64
  308. fileCount := 0
  309. for _, hash := range myHashes {
  310. values := index[hash]
  311. count++
  312. if count%100 == 0 {
  313. fmt.Printf("%d (%d) checking\n", count, size)
  314. }
  315. if len(values) > 1 {
  316. info, err := os.Stat(values[0])
  317. if err == nil {
  318. w.WriteString(fmt.Sprintf("found identically hash: size: %d\n", info.Size()))
  319. filesize += int64(len(values)-1) * info.Size()
  320. }
  321. fileCount += len(values) - 1
  322. for _, filename := range values {
  323. w.WriteString(fmt.Sprintf(" %s\n", filename))
  324. }
  325. w.Flush()
  326. }
  327. }
  328. w.WriteString(fmt.Sprintf("can save up to %s on %d files\n", bytefmt.ByteSize(uint64(filesize)), fileCount))
  329. w.Flush()
  330. }
  331. }
  332. func contains(filepath string) bool {
  333. return slices.Contains(ignoreLines, strings.ToLower(filepath))
  334. }
  335. func loadIgnoreFile(filename string) {
  336. content, err := os.ReadFile(filename)
  337. if err == nil {
  338. lines := strings.Split(string(content), "\n")
  339. ignoreLines = make([]string, 0)
  340. for _, line := range lines {
  341. line = strings.TrimSpace(line)
  342. line = strings.ToLower(line)
  343. line, _ = strings.CutSuffix(line, "\r")
  344. if line != "" {
  345. ignoreLines = append(ignoreLines, line)
  346. }
  347. }
  348. slices.Sort(ignoreLines)
  349. }
  350. }
  351. func search(srcHash string, exFilename string, exFilepath string) (value string, found bool) {
  352. for _, hashFile := range hashes {
  353. for filename, hash := range hashFile.Hashes {
  354. if (filename != exFilename) && (hashFile.Path != exFilepath) {
  355. if hash == srcHash {
  356. value += fmt.Sprintf("%s/%s;", hashFile.Path, filename)
  357. found = true
  358. }
  359. }
  360. }
  361. }
  362. return
  363. }
  364. func loadAllHashFiles(folder string) {
  365. count = 0
  366. addWork = 0
  367. err := filepath.Walk(folder, func(path string, info os.FileInfo, err error) error {
  368. if info != nil {
  369. if info.IsDir() {
  370. count++
  371. fmt.Print(".")
  372. if (count % 100) == 0 {
  373. fmt.Println()
  374. }
  375. hashFile, ok := hashes[path]
  376. if !ok {
  377. _, err := os.Stat(path + "/.fdhashes3")
  378. if os.IsNotExist(err) {
  379. hashFile = hash.Fdhashes{Path: path, Hashes: make(map[string]string), Times: make(map[string]time.Time), Dirty: true}
  380. } else {
  381. hf, err := hash.LoadHashfile(path + "/.fdhashes3")
  382. check(err)
  383. hashFile = *hf
  384. }
  385. hashes[path] = hashFile
  386. }
  387. }
  388. }
  389. return nil
  390. })
  391. check(err)
  392. fmt.Printf("\nfound %d hash files.\n", len(hashes))
  393. }
  394. func check(e error) {
  395. if e != nil {
  396. panic(e)
  397. }
  398. }