mirror of
https://github.com/GAM-team/GAM.git
synced 2026-06-18 13:11:37 +00:00
* Fix bug, update documentation * Clean up error messages * Exit on error, fix bug * One more bug fix * Update documentation, fix code: l_sku can never match a_sku.lower() because it has -'s stripped and a_sku doesn't * Keep the pylint wolf at bay * Clean up code, avoid try/except
81 lines
2.2 KiB
Python
81 lines
2.2 KiB
Python
import collections
|
|
import re
|
|
import sys
|
|
from htmlentitydefs import name2codepoint
|
|
from HTMLParser import HTMLParser
|
|
from var import GM_Globals, GM_WINDOWS, GM_SYS_ENCODING
|
|
|
|
def convertUTF8(data):
    """Recursively convert unicode text inside *data* to encoded strings.

    * ``str`` values are returned untouched.
    * ``unicode`` values are returned untouched when
      ``GM_Globals[GM_WINDOWS]`` is truthy; otherwise they are encoded
      with the codec named by ``GM_Globals[GM_SYS_ENCODING]``.
    * Mappings are rebuilt as a plain ``dict`` from their converted items.
    * Any other iterable is rebuilt as the same type from its converted
      elements.
    * Everything else is returned unchanged.
    """
    if isinstance(data, str):
        return data
    if isinstance(data, unicode):
        if not GM_Globals[GM_WINDOWS]:
            return data.encode(GM_Globals[GM_SYS_ENCODING])
        return data
    if isinstance(data, collections.Mapping):
        # Each item is a (key, value) tuple; converting the tuple converts
        # both members via the generic iterable branch below.
        return dict([convertUTF8(item) for item in data.iteritems()])
    if isinstance(data, collections.Iterable):
        container = type(data)
        return container([convertUTF8(element) for element in data])
    return data
|
|
|
|
class _DeHTMLParser(HTMLParser):
|
|
def __init__(self):
|
|
HTMLParser.__init__(self)
|
|
self.__text = []
|
|
|
|
def handle_data(self, data):
|
|
self.__text.append(data)
|
|
|
|
def handle_charref(self, name):
|
|
self.__text.append(unichr(int(name[1:], 16)) if name.startswith('x') else unichr(int(name)))
|
|
|
|
def handle_entityref(self, name):
|
|
cp = name2codepoint.get(name)
|
|
if cp:
|
|
self.__text.append(unichr(cp))
|
|
else:
|
|
self.__text.append(u'&'+name)
|
|
|
|
def handle_starttag(self, tag, attrs):
|
|
if tag == 'p':
|
|
self.__text.append('\n\n')
|
|
elif tag == 'br':
|
|
self.__text.append('\n')
|
|
elif tag == 'a':
|
|
for attr in attrs:
|
|
if attr[0] == 'href':
|
|
self.__text.append('({0}) '.format(attr[1]))
|
|
break
|
|
elif tag == 'div':
|
|
if not attrs:
|
|
self.__text.append('\n')
|
|
elif tag in ['http:', 'https']:
|
|
self.__text.append(' ({0}//{1}) '.format(tag, attrs[0][0]))
|
|
|
|
def handle_startendtag(self, tag, attrs):
|
|
if tag == 'br':
|
|
self.__text.append('\n\n')
|
|
|
|
def text(self):
|
|
return re.sub(r'\n{2}\n+', '\n\n', re.sub(r'\n +', '\n', ''.join(self.__text))).strip()
|
|
|
|
def dehtml(text):
    """Return a plain-text rendering of the HTML in *text*.

    The text is UTF-8 encoded, fed through _DeHTMLParser, and the parser's
    tidied output is returned. This is best effort: if anything goes wrong
    while parsing, the traceback is printed to stderr and the original
    *text* is returned unchanged.
    """
    try:
        parser = _DeHTMLParser()
        parser.feed(text.encode(u'utf-8'))
        parser.close()
        return parser.text()
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate to the caller.
        from traceback import print_exc
        print_exc(file=sys.stderr)
        return text
|
|
|
|
def indentMultiLineText(message, n=0):
    """Indent every continuation line of *message* by *n* spaces.

    A run of *n* spaces is inserted after each newline (the first line is
    left alone), then trailing whitespace is stripped from the result.
    """
    continuation = u'\n' + u' ' * n
    indented = message.replace(u'\n', continuation)
    return indented.rstrip()
|
|
|
|
def formatMilliSeconds(millis):
    """Format a duration given in milliseconds as ``HH:MM:SS``.

    Fractions of a second are discarded. Hours are not wrapped, so
    durations of 100 hours or more produce more than two hour digits.
    """
    total_seconds = millis // 1000
    hours, remainder = divmod(total_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    return u'%02d:%02d:%02d' % (hours, minutes, seconds)
|