7.04.2015 г.

Python search for duplicated files


##################################################
#   Search for duplicated files by content;      #
#   the file extension is ignored                #
#   DO NOT EDIT BY HAND!!!                       #
##################################################

#IMPORT MODULES FOR SCRIPT

import hashlib, os
from itertools import *
from operator import itemgetter

# Module-level MD5 hash object.
# NOTE(review): nothing in the visible code reads `md` (md_5 builds its own
# hashlib.md5() for every file) -- confirm it is unused before removing it.

md = hashlib.md5()
#Directory = os.getcwd()
# Root directory to scan, followed by the lists the script fills in.
Dir = "C:\\SCRIPT"
_File = []       # paths of the files found under Dir
_File_H = []     # MD5 hex digest of each file, parallel to _File
_duplicated = []   # digests consulted by the duplicate-detection loop
_duplicated_sravnqva = [] # digests flagged as repeated; used to filter the path/digest pairs
subdirlist = []    # only referenced by the commented-out search_file; may be deleted if that function is not restored

# Search with os.listdir -- not working at the moment, kept commented out
# def search_file(Dir):

#  for file in os.listdir(Dir):
#   if os.path.isfile(file):
#    yield os.path.join(Dir,file)
#   else:
#    subdirlist.append(os.path.join(Dir, file))
#    try:
#     for subdir in subdirlist:
#      search_file(subdir)
#    except:
#     print "Somting wrong"

#SEARCH WITH OS.WALK

def search_os_walk(Dir):
    """Lazily yield the full path of every file found below *Dir*.

    The tree is traversed with os.walk, so files in all nested
    sub-directories are included. Directories themselves are not yielded.
    """
    for root, _subdirs, names in os.walk(Dir):
        # Join each bare file name with the directory it was found in.
        for full_path in (os.path.join(root, name) for name in names):
            yield full_path

# Compute the MD5 digest of a file (MD5 is a hash, not encryption)
def md_5(filePath):
    """Return the MD5 hex digest of the contents of the file at *filePath*.

    The file is opened in binary mode and read in 8192-byte chunks, so it is
    never loaded into memory as a whole.
    """
    digest = hashlib.md5()
    with open(filePath, 'rb') as stream:
        # iter(callable, sentinel) keeps reading until read() returns b'' (EOF).
        for chunk in iter(lambda: stream.read(8192), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Scan Dir and append each file's path and MD5 digest to the lists declared above

# Scan Dir once, keeping every path and its MD5 digest in parallel lists.
for file_path in search_os_walk(Dir):
    _File.append(file_path)
    _File_H.append(md_5(file_path))

# Flag repeated digests: every digest is recorded in _duplicated, and a digest
# that was already recorded when it is met again goes to _duplicated_sravnqva.
# The recording must happen for EVERY file, outside the `if`; when it was
# nested inside the `if`, _duplicated stayed empty and no duplicate was found.
_seen = set()  # same content as _duplicated, but with O(1) membership tests
for z in _File_H:
    if z in _seen:
        _duplicated_sravnqva.append(z)
    _seen.add(z)
    _duplicated.append(z)

print("*" * 80)

# Keep only the (path, digest) pairs whose digest occurs more than once,
# reusing the digests computed above instead of hashing the files again.
# The sort is stable, so paths inside a group keep their scan order.
_repeated = set(_duplicated_sravnqva)
di = sorted(
    ((path, digest)
     for path, digest in zip(_File, _File_H)
     if digest in _repeated),
    key=itemgetter(1)
)

# groupby only merges adjacent items, hence the sort by digest above.
for k, g in groupby(di, key=itemgetter(1)):
    # To show one path per row instead, replace the print below with:
    # for i, z in enumerate(map(itemgetter(0), g)):
    #     print("%s %s" % (i, z))
    print("Duplicated  %s" % (list(map(itemgetter(0), g)),))

# Keep the console window open until Enter is pressed.
# raw_input exists only on Python 2; fall back to input on Python 3.
try:
    _wait_for_enter = raw_input
except NameError:
    _wait_for_enter = input
_wait_for_enter()

Няма коментари :

Публикуване на коментар