Python - Recursive Directory Crawl Using Generators

Published on November 06, 2007

I was looking for an os.walk example to crawl through a file system and found the locate function below on ActiveState's Python Cookbook site. I incorporated it into a simple routine that dumps the output to an XML file that can then be transformed using XSLT to sort and tally the results.

#!/usr/bin/env python

import os
import fnmatch
import time
from xml.dom import minidom

def locate(pattern, root=os.curdir):
    """Yield the path of every file under *root* whose name matches *pattern*.

    *pattern* is an ``fnmatch``-style wildcard applied to the bare file name
    (not the directory part).  *root* is made absolute before the walk, so
    every yielded path is absolute.  Results are produced lazily, one
    directory at a time, in ``os.walk`` order.
    """
    start = os.path.abspath(root)
    for folder, _subdirs, names in os.walk(start):
        matches = fnmatch.filter(names, pattern)
        for name in matches:
            yield os.path.join(folder, name)
 
def main(pattern="*.*", root="\\\\SERVER\\Share", output="myfiles.xml"):
    """Crawl *root* and write an XML inventory of the matching files.

    Every path yielded by ``locate(pattern, root)`` becomes a ``<filename>``
    element under a single ``<files>`` root element, with these attributes:

    * ``id`` -- position of the file in the crawl (0-based)
    * ``path`` -- the path as yielded by ``locate``
    * ``ext`` -- lower-cased extension, including the leading dot
    * ``size`` -- size in bytes
    * ``last_modified`` -- ``time.ctime`` rendering of the modification time

    Files whose metadata cannot be read are reported on standard output and
    left out of the document, so ``id`` values may contain gaps.

    Args:
        pattern: ``fnmatch``-style file name pattern passed to ``locate``.
        root: directory where the crawl starts.
        output: path of the XML file to write (overwritten if it exists).

    The defaults are the values that were previously hard-coded, so calling
    ``main()`` with no arguments behaves as before.
    """
    doc = minidom.Document()
    files = doc.createElement("files")
    doc.appendChild(files)
    files.appendChild(doc.createComment("Size attribute is reported in bytes."))

    for i, path in enumerate(locate(pattern, root)):
        try:
            # Stat once and reuse the result: a second call is wasted I/O and
            # could observe a different file state than the first.
            info = os.stat(path)
            modified = time.ctime(info.st_mtime)
        except OSError as e:
            # Best effort: report the unreadable entry and keep crawling.
            print("%s => %s" % (path, e.strerror))
            continue

        item = doc.createElement("filename")
        item.setAttribute("id", str(i))
        item.setAttribute("path", path)
        item.setAttribute("ext", os.path.splitext(path)[1].lower())
        item.setAttribute("size", str(info.st_size))
        item.setAttribute("last_modified", modified)
        files.appendChild(item)

    # Serialise to bytes and write in binary mode so the file content really
    # is UTF-8, as the XML declaration states, whatever the platform's
    # default text encoding is.
    xml_bytes = doc.toprettyxml(indent="  ", newl="\n", encoding="utf-8")
    with open(output, "wb") as fp:
        fp.write(xml_bytes)

# Run the crawl only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

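The locate function takes two parameters: the first is a file pattern to match, and the second is the directory to start the crawl from (it defaults to the current directory). Because locate is a generator, matching paths are yielded one at a time as the directory tree is walked rather than being collected into a list first.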

Comments are closed.

Comments have been closed for this post.