#! /usr/bin/env python import sys, os, stat, string, formatter, time def cleanhtmls(str, index): # print "CLEAN %d %s"%(len(str), str[index:-1]) qchar=str[index+8] # print "qchar = "+qchar u = str[index+9:-1] # print "u = " + u next = string.find(u,qchar) if next < 0: return u url = u[0:next] rest = str[index+next+10:-1] # print "URL = "+url # print "rest = " + rest S = u if string.find(url,"http") < 0: pound = string.find(url,"#") if pound > 0: # print "FOUND POUND "+url+" REST "+rest rest = url[pound:-1] + rest url = url[0:pound] # print "ROUND POUND "+url+" REST "+rest ext = os.path.splitext(url)[1] if ext == '.html': url = os.path.splitext(url)[0] + '.htm' S = url.lower() + qchar + rest nextindex = string.find(S.lower(), " %s" % (filename, dir) html2htm(os.path.join(dir,filename), os.path.join(dir,'tmp_tmp')) os.unlink(os.path.join(dir,filename)) filename = filename.lower() os.rename(os.path.join(dir,'tmp_tmp'), os.path.join(dir,filename)) elif ext == '.html': newname = os.path.splitext(filename)[0]+'.htm' newname = newname.lower() print " HTML with .html extension " + filename print " cleaning to " + newname html2htm(os.path.join(dir,filename), os.path.join(dir,newname)) os.unlink(os.path.join(dir,filename)) # print "(HTML) %12s <> %s" % (newname, dir) # main *************************************************************** ###################################################################### def main(): inputdirs = sys.argv[1:] print time.strftime("html 2 htm = %a, %d %b %Y %H:%M:%S +0000", time.gmtime(time.time())) print inputdirs for dir in inputdirs: os.path.walk(dir,parseHTMLFiles,0) if __name__ == '__main__': main()
#
# NOTE(review): state of this file
# ---------------------------------------------------------------------
# * The whole script above sits on ONE physical line that begins with
#   "#", so Python treats all of it as a comment: running this file
#   currently executes nothing.  The original line breaks and
#   indentation have to be restored before any of it can run.
# * The text is also incomplete.  It jumps from
#       nextindex = string.find(S.lower(), "
#   straight to
#       %s" % (filename, dir)
#   so the end of cleanhtmls(), the definition of html2htm() (called
#   twice below that point) and the "def" line plus first condition of
#   parseHTMLFiles() (referenced from main()) are not present here.
#   Presumably markup-like text inside string literals was stripped by
#   whatever flattened the file -- TODO confirm and restore from the
#   original source rather than re-inventing the missing code.
# * The surviving code uses print statements, string.find() and
#   os.path.walk(), i.e. Python 2 syntax and APIs.
#
# What the surviving fragments show
# ---------------------------------------------------------------------
# cleanhtmls(str, index)  [tail missing]
#   - takes the character at index+8 as the quote character and the
#     text from index+9 up to (not including) the last character of
#     `str` as the candidate URL text `u`;
#   - returns `u` unchanged when no closing quote character is found;
#   - for URLs that do not contain "http": moves a "#..." part (when
#     "#" is found past position 0) from the URL into `rest`, replaces
#     a ".html" extension with ".htm", and builds
#     S = url.lower() + qchar + rest;
#   - NOTE(review): every "[...:-1]" slice drops the final character of
#     the string it slices (including url[pound:-1]) -- confirm that is
#     intended once the function is restored.
#   - what happens after the search on S.lower() is not visible.
#
# File-handling fragment (presumably the body of parseHTMLFiles)
#   - one branch (its condition is not visible): writes the output of
#     html2htm() to "tmp_tmp" in the same directory, deletes the
#     original file, then renames "tmp_tmp" to the lower-cased
#     original filename;
#   - `elif ext == '.html'`: prints two progress lines, writes the
#     output of html2htm() to the lower-cased name with a ".htm"
#     extension, then deletes the original file.
#
# main()
#   - treats every command-line argument as a directory, prints a GMT
#     timestamp and the directory list, then calls
#     os.path.walk(dir, parseHTMLFiles, 0) for each directory.