#!/usr/bin/env python
"""html2css.

Turn an HTML resource (url, file or string) into the corresponding CSS.

Visit http://code.google.com/p/html2css/ to view the newest version.

Requires the BeautifulSoup library (version 3.0.3), which comes bundled.
"""

__author__ = 'Jonathan Holst (holst.biz)'
__version__ = '0.2'
__date__ = '2007-09-01'
__license__ = 'lGPL'
__usage__ = """Usage: html2css -s SOURCE

Options:
  -s SOURCE, --source=SOURCE  URL, local file or HTML string to convert
  -h, --help                  show this help text and exit
"""

import getopt
import sys
import urllib

from beautifulsoup import BeautifulSoup
import tools


class html2css(object):
    """Build an empty CSS skeleton from the elements of an HTML document.

    Two rule lists are collected while parsing:

    * ``looseRules``  -- every distinct tag name, in document order.
    * ``strictRules`` -- every distinct full selector (ancestors, id and
      classes included), in document order.

    ``display()`` renders both lists as empty CSS rule blocks.
    """

    config = {
        'tabsize': 1,  # Number of spaces per nesting level, or 'tab' for \t
    }

    # Tags that never get a rule; their descendants are skipped as well.
    nonCssTags = (u'head',)

    def __init__(self, source):
        """Parse ``source`` right away.

        ``source`` may be a URL, a local file name, a string of HTML or an
        already built ``BeautifulSoup`` instance.
        """
        self.source = source
        self.looseRules = []
        self.strictRules = []
        self.__parse()

    def display(self):
        """Return the collected rules as a CSS string.

        Loose (tag name) rules come first, followed by the strict (full
        selector) rules. Surrounding whitespace is stripped.
        """
        loose = ''.join([self.__displayRule(name)
                         for name in self.looseRules])
        strict = ''.join([self.__displayRule(rule)
                          for rule in self.strictRules])
        css = loose + '\n' + strict
        # The extra separator above yields three newlines in a row; collapse
        # them so rules are always separated by exactly one blank line.
        css = css.replace('\n\n\n', '\n\n')
        return css.strip()

    def __parse(self, source=None):
        """Populate ``looseRules`` and ``strictRules`` from ``source``.

        ``source`` defaults to ``self.source``. Anything that is not already
        a ``BeautifulSoup`` instance is resolved to markup first (see
        ``__readSource``) and ``self.source`` is updated to that markup.
        """
        source = source or self.source
        if isinstance(source, BeautifulSoup):
            self.source = source
            element = source
        else:
            self.source = self.__readSource(source)
            element = BeautifulSoup(self.source)

        # Sets give constant-time duplicate checks; the lists keep the order.
        seenStrict = set(self.strictRules)
        seenLoose = set(self.looseRules)

        # Calling the soup object returns the tags of the document
        # (BeautifulSoup's shorthand for findAll).
        for tag in element():
            if self.__shouldNotBeHere(tag):
                continue
            rule = self.__parseTag(tag)
            if rule not in seenStrict:
                seenStrict.add(rule)
                self.strictRules.append(rule)
            if tag.name not in seenLoose:
                seenLoose.add(tag.name)
                self.looseRules.append(tag.name)

    def __readSource(self, source):
        """Resolve ``source`` to markup.

        The source is tried as a URI first, then as a local file name; if
        neither can be read it is assumed to be the markup itself and is
        returned unchanged.
        """
        # Deliberately broad: arbitrary HTML handed to urlopen()/open() can
        # fail in many ways, and every failure just means "try the next
        # interpretation". Unlike a bare except, this lets
        # KeyboardInterrupt and SystemExit propagate.
        try:
            # It's a URI
            connection = urllib.urlopen(source)
            try:
                return connection.read()
            finally:
                connection.close()
        except Exception:
            pass

        try:
            # It's a local file
            handle = open(source, 'r')
            try:
                return handle.read()
            finally:
                handle.close()
        except Exception:
            pass

        # It's a string
        return source

    def __displayRule(self, tag):
        """Return an empty CSS rule block for the selector ``tag``."""
        return '%s {}\n\n' % (tag,)

    def __getParents(self, tag):
        """Get the selectors of all parent elements for a tag.

        Returns a list ordered from the nearest parent outwards; the list is
        empty for top-level tags.
        """
        parents = []
        node = tag.parent
        # Stop at the node that has no parent of its own: that is the
        # document instance, which the DOM treats as a parent but which is
        # not a real element.
        while node is not None and node.parent is not None:
            parents.append(self.__parseTag(node, withparents=False))
            node = node.parent
        return parents

    def __attribute(self, tag, name):
        """Return attribute ``name`` of ``tag``, or None when it is absent."""
        try:
            return tag[name]
        except KeyError:
            return None

    def __parseTag(self, tag, withparents=True):
        """Build the CSS selector for ``tag``.

        The selector is the tag name followed by ``#id`` and ``.class`` parts
        when those attributes are present and non-empty. With
        ``withparents`` the ancestor selectors are prepended (outermost
        first) and the result is indented by one tab unit per ancestor.
        """
        tagname = tag.name

        tagId = self.__attribute(tag, 'id')
        if tagId:
            tagname = '%s#%s' % (tagname, tagId)

        tagClass = self.__attribute(tag, 'class')
        if tagClass:
            # split() copes with repeated or surrounding whitespace, which
            # would otherwise produce empty class names such as "div..a".
            classNames = tagClass.split()
            if classNames:
                tagname = '%s.%s' % (tagname, '.'.join(classNames))

        if withparents:
            parents = self.__getParents(tag)
            if parents:
                tab = len(parents) * self.__tabs()
                # NOTE(review): tools.returnreversed is presumably a
                # non-mutating reverse, giving outermost-first order --
                # confirm against the bundled tools module.
                parents = ' '.join(tools.returnreversed(parents))
                tagname = '%s%s %s' % (tab, parents, tagname)
        return tagname

    def __tabs(self):
        """Return the indentation unit configured by ``config['tabsize']``."""
        tabsize = self.config['tabsize']
        if isinstance(tabsize, int):
            return tabsize * ' '
        elif tabsize == 'tab':
            return '\t'
        else:
            return ' '

    def __shouldNotBeHere(self, tag):
        """Check if tag, or any of its parent elements, is in the blacklist."""
        if tag.name in self.nonCssTags:
            return True
        node = tag.parent
        while node is not None and node.parent is not None:
            # Compare the bare tag name so a blacklisted parent still
            # matches when it carries an id or class.
            if node.name in self.nonCssTags:
                return True
            # Also compare the full selector, so blacklist entries written
            # as selectors keep working.
            if self.__parseTag(node, withparents=False) in self.nonCssTags:
                return True
            node = node.parent
        return False


def usage():
    """Print the command line help text."""
    print(__usage__)


def main(argv=None):
    """Run the command line interface and return the process exit status.

    ``argv`` defaults to ``sys.argv[1:]``. Returns 0 on success or when help
    was requested, and 2 when the options are invalid or no source is given.
    """
    if argv is None:
        argv = sys.argv[1:]

    try:
        (opts, args) = getopt.getopt(argv, 's:h', ['source=', 'help'])
    except getopt.GetoptError:
        usage()
        return 2

    source = None
    for (opt, arg) in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        elif opt in ('-s', '--source'):
            source = arg

    if not source:
        usage()
        return 2

    h = html2css(source)
    print(h.display())
    return 0


if __name__ == '__main__':
    sys.exit(main())