#!/usr/bin/env python
"""html2css.
Turn an HTML resource (URL, file or string) into the corresponding CSS.
Visit http://code.google.com/p/html2css/ to view the newest version.
Requires the BeautifulSoup library (version 3.0.3), which comes bundled.
"""
__author__ = 'Jonathan Holst (holst.biz)'
__version__ = '0.2'
__date__ = '2007-09-01'
__license__ = 'lGPL'
from beautifulsoup import BeautifulSoup
import sys, getopt, urllib, tools
class html2css(object):
    """Build a skeleton stylesheet from the tags found in an HTML source.

    The source is parsed on construction. Two lists of selectors are
    collected, in document order and without duplicates:

    * ``looseRules``  -- bare tag names (``div``, ``p``, ...).
    * ``strictRules`` -- full selectors made of the ancestor chain plus the
      tag itself, each decorated with ``#id`` / ``.class`` where present and
      indented according to nesting depth.

    ``display()`` renders both lists as empty CSS rules.
    """

    config = {
        'tabsize': 1,  # Number of spaces, or 'tab' if \t
    }
    # Tags that never get a rule; anything nested inside them is skipped too.
    nonCssTags = (u'head',)

    def __init__(self, source):
        """Store *source* and parse it immediately.

        *source* may be an already-built BeautifulSoup object, a URL, a
        path to a local file, or a string of HTML.
        """
        self.source = source
        self.looseRules = []
        self.strictRules = []
        self.__parse()

    def display(self):
        """Return the collected rules as CSS text.

        Loose rules come first, then the strict rules. Surrounding
        whitespace is stripped from the result.
        """
        parts = [self.__displayRule(rule) for rule in self.looseRules]
        parts.append('\n')
        parts.extend(self.__displayRule(rule) for rule in self.strictRules)
        css = ''.join(parts)
        # The separator between the two sections would otherwise leave
        # one blank line too many.
        css = css.replace('\n\n\n', '\n\n')
        return css.strip()

    def __loadMarkup(self, source):
        """Resolve *source* to markup text.

        Tries, in order: opening it as a URL, opening it as a local file,
        and finally using it verbatim as an HTML string. Handles are always
        closed, even when reading fails.
        """
        try:
            # It's a URI
            handle = urllib.urlopen(source)
            try:
                return handle.read()
            finally:
                handle.close()
        except Exception:
            # Best effort: any failure just means "not a readable URL".
            pass
        try:
            # It's a local file
            handle = open(source, 'r')
        except Exception:
            # It's a string
            return source
        try:
            return handle.read()
        finally:
            handle.close()

    def __parse(self, source=None):
        """Populate ``looseRules`` and ``strictRules`` from *source*.

        When *source* is omitted, ``self.source`` is used. ``self.source``
        is replaced by the parsed object or by the loaded markup.
        """
        source = source or self.source
        if isinstance(source, BeautifulSoup):
            self.source = source
            element = source
        else:
            self.source = self.__loadMarkup(source)
            element = BeautifulSoup(self.source)
        # Calling the parsed document yields the tags it contains.
        for tag in element():
            if self.__shouldNotBeHere(tag):
                continue
            parsedTag = self.__parseTag(tag)
            if parsedTag not in self.strictRules:
                self.strictRules.append(parsedTag)
            if tag.name not in self.looseRules:
                self.looseRules.append(tag.name)

    def __displayRule(self, tag):
        """Format one selector as an empty CSS rule."""
        return '%s {}\n\n' % tag

    def __getParents(self, tag):
        """Return the selectors of all ancestors of *tag*, nearest first.

        The topmost object in the tree (the one without a parent of its
        own, i.e. the document instance rather than a real element) is left
        out. An empty list is returned when there are no ancestors.
        """
        parents = []
        node = tag.parent
        # Walk upwards once; stop before the parentless document object.
        while node is not None and node.parent is not None:
            parents.append(self.__parseTag(node, withparents=False))
            node = node.parent
        return parents

    def __parseTag(self, tag, withparents=True):
        """Build the CSS selector for *tag*.

        The selector is ``name``, plus ``#id`` and ``.class`` (one
        ``.part`` per whitespace-separated class name) when those attributes
        are present and non-empty. With *withparents* true, the ancestor
        selectors are prepended (outermost first) and the whole selector is
        indented by one tab unit per ancestor.
        """
        tagname = tag.name
        try:
            tagId = tag['id']
        except KeyError:
            tagId = None
        if tagId:
            tagname = '%s#%s' % (tagname, tagId)
        try:
            tagClass = tag['class']
        except KeyError:
            tagClass = None
        if tagClass:
            # split() copes with repeated or odd whitespace between class
            # names, which a plain replace(' ', '.') turns into 'a..b'.
            classes = tagClass.split()
            if classes:
                tagname = '%s.%s' % (tagname, '.'.join(classes))
        if withparents:
            parents = self.__getParents(tag)
            if parents:
                tab = len(parents) * self.__tabs()
                parents = ' '.join(tools.returnreversed(parents))
                tagname = '%s%s %s' % (tab, parents, tagname)
        return tagname

    def __tabs(self):
        """Return one indentation unit as configured by ``config['tabsize']``.

        An integer means that many spaces, ``'tab'`` means a tab character,
        and anything else falls back to a single space.
        """
        tabsize = self.config['tabsize']
        if isinstance(tabsize, int):
            return tabsize * ' '
        if tabsize == 'tab':
            return '\t'
        return ' '

    def __shouldNotBeHere(self, tag):
        """
        Check if tag, or any of its ancestors, is in the blacklist.
        """
        node = tag
        # Compare bare tag names so that a blacklisted ancestor carrying an
        # id or class is still recognised.
        while node is not None:
            if node.name in self.nonCssTags:
                return True
            node = node.parent
        return False
def usage():
    """Print command-line help to standard output.

    A module-level ``__usage__`` string is printed as-is when one is
    defined. Otherwise the module docstring (if any) is printed, followed by
    a synopsis of the options the command-line entry point accepts.
    """
    text = globals().get('__usage__')
    if text is None:
        # No dedicated usage text is defined: build one instead of failing
        # with a NameError.
        synopsis = ('Usage: %s [-h|--help] (-s|--source) '
                    '<url, file or html string>' % sys.argv[0])
        text = ('%s\n\n%s' % ((__doc__ or '').strip(), synopsis)).strip()
    print(text)
def main(argv=None):
    """Run the command-line interface and return a process exit status.

    *argv* is the list of arguments without the program name; it defaults
    to ``sys.argv[1:]``. Returns 0 after printing the generated CSS or the
    help text requested with ``-h``/``--help``, and 2 when the options are
    invalid or no source was given with ``-s``/``--source``.
    """
    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, _args = getopt.getopt(argv, 's:h', ['source=', 'help'])
    except getopt.GetoptError:
        usage()
        return 2
    # Start unset so that a missing -s is reported as a usage error rather
    # than surfacing as a NameError.
    source = None
    for opt, arg in opts:
        if opt in ('-h', '--help'):
            usage()
            return 0
        if opt in ('-s', '--source'):
            source = arg
    if not source:
        usage()
        return 2
    print(html2css(source).display())
    return 0


if __name__ == '__main__':
    sys.exit(main())