#! /usr/bin/env python import sys, os, stat, string, htmllib, formatter, time # Routines to make dictoinary **************************************** ###################################################################### jpegdict = {} def testDuplicates(filename, dir1, dir2): # print "DEBUG %12s <> %s %s" % (filename, dir1, dir2) file1 = os.path.join(dir1,filename) size1 = 0 # NAif os.path.isfile(size1): try: fstat = os.stat(file1) size1 = fstat[stat.ST_SIZE]/1000 # add in the file's size except os.error, exc: sys.stderr.write("stat error %s:%s\n" % (file1, exc)) file2 = os.path.join(dir2,filename) size2 = 0 # NAif os.path.isfile(size2): try: fstat = os.stat(file2) size2 = fstat[stat.ST_SIZE]/1000 # add in the file's size except os.error, exc: sys.stderr.write("stat error %s:%s\n" % (file2, exc)) if size1 == size2: print "%12s DUP! NAME & SIZE %d KB <> %s AND %s" % (filename, size1, file1, file2) else: print "%12s SAME NAME <> %s %d KB AND %s %d KB" % (filename, file1, size1, file2, size2) def isThumb(path): dir = path while len(dir) > 4: # print "isThumb " + dir if os.path.split(dir)[1] == 'thumbs': return 'yes' dir = os.path.split(dir)[0] # print "isThumb NOT "+path return None def isInteresting(ext): if ext == '.jpg' or ext == '.gif' or ext == '.png' or ext == '.htm' or ext == '.html' or ext == '.txt' or ext == '.pdf' or ext == '.mov' or ext == '.wrl' or ext == '.doc': return ext return None def skipDir(dirname): if dirname == 'thumbs': return 1 if dirname == 'code': return 1 if dirname == 'vrml': return 1 if dirname == 'private': return 1 if len(dirname) == 0: return 1 return None def dictifyDir(arg, dir, names): randoms = 0 things = 0 subdirs = 0 dirname = os.path.split(dir)[1] if skipDir(dirname): return if dirname[0] == '.': print "found XVPICS "+os.path.abspath(dir) return for filename in names: ext = os.path.splitext(filename)[1] if isInteresting(ext): # print "%12s <> %s/%s" % (filename, dir, filename) things = things+1 if jpegdict.has_key(filename): if type(jpegdict[filename]) == type([]): print "%12s ADDing to list %s" % (filename, dir) jpegdict[filename].append(dir) else: testDuplicates(filename, dir, jpegdict[filename]) jpegdict[filename] = [ jpegdict[filename], dir ] else: jpegdict[filename] = dir elif os.path.isdir(os.path.join(dir,filename)): subdirs = subdirs+1 else: randoms = randoms+1 print "(DIR) %s has %d files, %d subdirs and %d randoms" % (dir, things, subdirs, randoms) # Routines to parse HTML ********************************************* ###################################################################### def markFileUsed(path,htmlfile): filename = os.path.split(path)[1] dir = os.path.split(path)[0] if jpegdict.has_key(filename): if type(jpegdict[filename]) == type([]): # print "%12s ADDing to list %s" % (filename, dir) jpegdict[filename].append(htmlfile) else: jpegdict[filename] = [ jpegdict[filename], htmlfile ] else: print "FILE NONEXISTENT %s in %s "%(path,htmlfile) jpegdict[filename] = htmlfile class myhtml(htmllib.HTMLParser): def handle_image(self, image, alt, ismap, align, width, height): # print "image: " + os.path.join(self.dir,image) if isThumb(image): return markFileUsed(os.path.join(self.dir,image),self.htm) def anchor_bgn(self, href, name, type): if string.find(href,"http:") >= 0 or string.find(href,"mailto:") >= 0 or string.find(href,"news:") >= 0: return pound = string.find(href,"#") if pound == 0: return if pound > 0: href = href[0:pound] # print "ROUND POUND "+url+" REST "+rest # R E L A T I V E P A T H N A M E S #print "href: " + os.path.join(self.dir,href) markFileUsed(os.path.join(self.dir,href),self.htm) def setCurrentHTML(self,dir,filename): self.dir = dir self.htm = 'HTML:'+os.path.join(self.dir,filename) def parseHTMLFiles(arg, dir, names): dirname = os.path.split(dir)[1] if skipDir(dirname): return if dirname[0] == '.': print "found XVPICS "+os.path.abspath(dir) return for filename in names: ext = os.path.splitext(filename)[1] if ext == '.htm': print "(HTML) %12s <> %s" % (filename, dir) parser = myhtml(formatter.NullFormatter()) parser.setCurrentHTML(dir,filename) parser.feed(open(os.path.join(dir,filename)).read()) elif ext == '.html': print " HTML with .html extension " + filename # main *************************************************************** ###################################################################### def main(): inputdirs = sys.argv[1:] print time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime(time.time())) print inputdirs print "*******************************************" for dir in inputdirs: os.path.walk(dir,dictifyDir,0) print "\n ************ found %d content files ***** \n"%(len(jpegdict)) for dir in inputdirs: os.path.walk(dir,parseHTMLFiles,0) print "\n ************ referenced %d files ************ \n"%(len(jpegdict)) sorted = [] for image in jpegdict.keys(): sorted.append(image) sorted.sort() for image in sorted: if type(jpegdict[image]) == type([]): print "%12s <> %s" % (image, jpegdict[image]) else: print "%12s %s" % (image, jpegdict[image]) if __name__ == '__main__': main()