# pyblosxom2leonardo.py
# Copyright (c) 2005 Tim Wegener
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject
# to the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
"""Convert a PyBlosxom blog to Leonardo LFS format.

This almost works, but needs to update ldb/property_index

Usage: python pyblosxom2leonardo.py <old_blog_dir> <old_comment_dir> <lfs_blog_dir>

This requires Python 2.6 or later (it also runs on Python 3).
If you want a backport, let me know.

This script is available under the MIT License.

"""

__author__ = 'Tim Wegener http://www.madabar.com'

import re
import os
import sys
import time
import shelve
import fnmatch
import xml.dom.minidom

# Patterns used to flatten comment markup to plain text.  They are compiled
# once here so that the flags really are flags: passing them as the fourth
# positional argument of re.sub() sets ``count`` instead.
_MARKUP_FLAGS = re.MULTILINE | re.DOTALL
# NOTE(review): the line-break pattern was an empty string in the source this
# was recovered from (which matches between every character); a <br> tag is
# the presumed intent -- confirm against the upstream script.
_LINE_BREAK_RE = re.compile(r'<br\s*/?>', _MARKUP_FLAGS | re.IGNORECASE)
_TAG_RE = re.compile(r'<[^>]*?>', _MARKUP_FLAGS)
_ENTITY_RE = re.compile(r'&\w{3,4};', _MARKUP_FLAGS)


def get_element_text(dom, name):
    """Return the node text for a DOM element.

    Arguments:
    dom -- DOM document (or node) to search
    name -- tag name of the element to look up

    Only the first element called `name` is considered.  An empty string
    is returned if there is no such element or if it has no children.

    """
    elements = dom.getElementsByTagName(name)
    # Test the node list itself before indexing it: a missing element must
    # yield '' rather than an IndexError.
    if not elements or elements[0].firstChild is None:
        return ''
    return elements[0].firstChild.nodeValue


def _write_properties(directory, properties):
    """Store `properties` in the 'property_db' shelf inside `directory`."""
    property_db = shelve.open(os.path.join(directory, 'property_db'))
    try:
        property_db.update(properties)
    finally:
        # Shelves are not context managers on Python 2, hence try/finally.
        property_db.close()


def _read_entry(path):
    """Return (title, body) for the PyBlosxom entry file at `path`."""
    with open(path, 'r') as f:
        # The title is the first line; the content is the rest of the file.
        page_title = f.readline().strip()
        body = f.read()
    return page_title, body


def _load_comments(comment_dir, entry_name, quiet):
    """Return the comments for an entry, sorted by creation time.

    Each item is a (properties, body) pair parsed from the files named
    '<entry_name>-<timestamp>.cmt' in `comment_dir`.  An empty list is
    returned if `comment_dir` does not exist.

    """
    if not os.path.exists(comment_dir):
        return []
    # Escape the entry name and anchor the extension so that entries with
    # regex metacharacters in their name, or stray files such as
    # 'entry-1.cmt.bak', are not picked up by mistake.
    comment_re = re.compile(r'%s-([\d\.]+)\.cmt$' % re.escape(entry_name))
    comments = []
    for comment_filename in os.listdir(comment_dir):
        if not comment_re.match(comment_filename):
            continue
        comment_path = os.path.join(comment_dir, comment_filename)
        if not quiet:
            print('comment filename: %s' % comment_path)
        # Parse the comment XML data.
        # Note: elements other than those read below are ignored.
        # Passing the path lets minidom open and close the file itself.
        dom = xml.dom.minidom.parse(comment_path)
        try:
            comment = {
                'comment_title': get_element_text(dom, 'title'),
                'author_name': get_element_text(dom, 'author'),
                'author_link': get_element_text(dom, 'link'),
                'creation_time': float(get_element_text(dom, 'pubDate')),
                'last_modified': os.path.getmtime(comment_path),
            }
            comment_body = get_element_text(dom, 'description')
        finally:
            dom.unlink()
        comments.append((comment, comment_body))
    comments.sort(key=lambda item: item[0]['creation_time'])
    return comments


def _comment_body_to_text(body):
    """Return the comment `body` with its markup flattened to plain text."""
    # todo: It would be nice if Leonardo support xhtml comments.
    # For now, convert tags to text.
    # NOTE(review): the '&amp;'/'&quot;' entity names are reconstructed (the
    # recovered source replaced '&' with '&', a no-op) -- confirm upstream.
    body = body.replace('&amp;', '&')
    body = body.replace('&quot;', "'")
    body = body.replace('"', "'")
    # Expose links as text.
    ## body = re.sub(r'<a[^>]*?href="([^"]*?)"[^>]*?>([^<]*?)</a>',
    ##               r'\2 [ \1 ]',
    ##               body)
    body = _LINE_BREAK_RE.sub('\n', body)
    # Remove any unsupported tags.
    body = _TAG_RE.sub('', body)
    # Drop whatever entities are left.
    body = _ENTITY_RE.sub('', body)
    return body


def pyblosxom2leonardo(old_blog_dir, old_comment_dir, lfs_blog_dir,
                       extension='.txt', use_last=True, quiet=False):
    """Convert a pyblosxom blog data store to a Leonardo LFS datastore.

    Arguments:
    old_blog_dir -- pyblosxom blog data directory
    old_comment_dir -- pyblosxom comment data directory
    lfs_blog_dir -- blog subdirectory underneath lfs directory
    extension -- extension of blog entry filenames (default: '.txt')
    use_last -- only use deepest subdirectory of blog entry for category
                otherwise, use each directory component as a category
                (default: True)
    quiet -- if set don't output informative messages (default: False)

    For each entry a '<year>/<month>/<day>/<entry>.ldv' directory is created
    under lfs_blog_dir (dated by the entry file's ctime), holding a
    'property_db' shelf, a '__content__.xhtml' file and one
    '__comment__<n>' directory per comment.  os.makedirs() raises OSError
    if an entry's .ldv directory already exists.

    """
    # todo: local or gmt?
    time2tuple = time.localtime

    # Read PyBlosxom tree.
    for dirpath, _dirnames, filenames in os.walk(old_blog_dir):
        # Directory components below the blog root.  Using relpath (rather
        # than counting path components) keeps this correct when
        # old_blog_dir has a trailing separator.
        relative_dir = os.path.relpath(dirpath, old_blog_dir)
        if relative_dir == os.curdir:
            dir_parts = []
        else:
            dir_parts = relative_dir.split(os.sep)

        # Get comment directory.
        comment_dir = os.path.join(old_comment_dir, *dir_parts)

        # Grab categories from dirname.
        categories = dir_parts
        if use_last:
            categories = categories[-1:]

        for filename in fnmatch.filter(filenames, '*%s' % extension):
            # Read old PyBlosxom data.
            path = os.path.join(dirpath, filename)
            if not quiet:
                print('entry filename: %s' % path)
            # Grab timestamps from blog file.
            mtime = os.path.getmtime(path)
            ctime = os.path.getctime(path)
            page_title, body = _read_entry(path)

            # Write new Leonardo LFS data.
            # Create LFS dirs based on date.
            # Note that month and day are zero padded to two digits!
            time_tuple = time2tuple(ctime)
            year = '%04d' % time_tuple[0]
            month = '%02d' % time_tuple[1]
            day = '%02d' % time_tuple[2]
            lfs_date_dir = os.path.join(lfs_blog_dir, year, month, day)

            # Create LFS .ldv subdir based on blog file name.
            entry_name = os.path.splitext(filename)[0]
            ldv_dir = os.path.join(lfs_date_dir, '%s.ldv' % entry_name)
            os.makedirs(ldv_dir)

            # Create property_db.
            _write_properties(ldv_dir, {
                'page_title': page_title,
                'categories': categories,
                'allow_comments': 'YES',
                'allow_trackbacks': 'YES',
                'creation_time': ctime,
                'last_modified': mtime,
            })

            # Create __content__.xhtml file from blog file body.
            ldv_content_filename = os.path.join(ldv_dir, '__content__.xhtml')
            with open(ldv_content_filename, 'w') as f_out:
                f_out.write(body)

            # Comments (Leonardo comments start at 1).
            comments = _load_comments(comment_dir, entry_name, quiet)
            for comment_i, (comment, comment_body) in enumerate(comments, 1):
                # Create comment directory.
                comment_ldv_dir = os.path.join(ldv_dir,
                                               '__comment__%d' % comment_i)
                os.mkdir(comment_ldv_dir)
                # Write out content.
                content_path = os.path.join(comment_ldv_dir,
                                            '__content__.txt')
                with open(content_path, 'w') as f:
                    f.write(_comment_body_to_text(comment_body))
                # Create property_db.
                _write_properties(comment_ldv_dir, comment)


def main():
    """Run the conversion using the three directories given in sys.argv.

    Prints the module docstring and exits with status 2 unless exactly
    three arguments are supplied.

    """
    # Usage
    if len(sys.argv) != 4:
        sys.stdout.write(__doc__)
        sys.exit(2)
    # PyBlosxom blog and comment data directories, then the Leonardo blog
    # data directory.
    old_blog_dir, old_comment_dir, lfs_blog_dir = sys.argv[1:4]
    pyblosxom2leonardo(old_blog_dir, old_comment_dir, lfs_blog_dir)


if __name__ == '__main__':
    main()