# -*- coding:utf8 -*-
"""html2markdown converts an html string to markdown while preserving unsupported markup."""
#
# Copyright 2017-2018 David Lönnhager (dlon)
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
# of the Software, and to permit persons to whom the Software is furnished
# to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
# PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

import re
import sys

import bs4
from bs4 import BeautifulSoup

if sys.version_info[0] > 2:
    # Python 3 has no separate unicode type; alias it so the code below runs
    # unchanged on both major versions.
    unicode = str

_supportedTags = {
    # NOTE: will be ignored if they have unsupported attributes (cf. _supportedAttributes)
    'blockquote',
    'p',
    'a',
    'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
    'strong', 'b',
    'em', 'i',
    'ul', 'ol', 'li',
    'br',
    'img',
    'pre', 'code',
    'hr',
}

# Each entry is '<tag name> <attribute name>'. A supported tag carrying any
# attribute that is not listed for it is left as raw HTML.
_supportedAttributes = (
    'a href',
    'a title',
    'img alt',
    'img src',
    'img title',
)

_inlineTags = {
    # these can be mixed with markdown (when unprocessed)
    # block tags will be surrounded by newlines and be unprocessed inside
    # (unless supported tag + supported attribute[s])
    # Commented-out names are deliberately not treated as inline.
    'a',
    'abbr',
    'acronym',
    'audio',
    'b',
    'bdi',
    'bdo',
    'big',
    #'br',
    'button',
    #'canvas',
    'cite',
    'code',
    'data',
    'datalist',
    'del',
    'dfn',
    'em',
    #'embed',
    'i',
    #'iframe',
    #'img',
    #'input',
    'ins',
    'kbd',
    'label',
    'map',
    'mark',
    'meter',
    #'noscript',
    'object',
    #'output',
    'picture',
    #'progress',
    'q',
    'ruby',
    's',
    'samp',
    #'script',
    'select',
    'slot',
    'small',
    'span',
    'strike',
    'strong',
    'sub',
    'sup',
    'svg',
    'template',
    'textarea',
    'time',
    'u',
    'tt',
    'var',
    #'video',
    'wbr',
}

_headingTags = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')


def _supportedAttrs(tag):
    """return True if every attribute on *tag* is listed in _supportedAttributes

    A tag without attributes is always accepted.
    """
    allowed = set()
    for entry in _supportedAttributes:
        tagName, attrName = entry.split(' ')
        if tagName == tag.name:
            allowed.add(attrName)
    return all(attr in allowed for attr in tag.attrs)


def _recursivelyValid(tag):
    """return True if *tag* and all of its descendants can be kept or converted

    Inline tags and the document root are accepted as they are; any other tag
    must be a supported tag that carries only supported attributes.
    """
    # not all tags require this property
    # requires: <blockquote><p style="...">asdf</p></blockquote>
    # does not: <div><p style="...">asdf</p></div>
    for child in tag.find_all(recursive=False):
        if not _recursivelyValid(child):
            return False
    if tag.name == '[document]' or tag.name in _inlineTags:
        return True
    if tag.name not in _supportedTags:
        return False
    return _supportedAttrs(tag)


_escapeCharSequence = tuple(r'\`*_[]#')
_escapeCharRegexStr = '([{}])'.format(''.join(re.escape(c) for c in _escapeCharSequence))
_escapeCharSub = re.compile(_escapeCharRegexStr).sub

# _markdownify() writes '<<<FLOATING LINK: ...>>>' and '<<<BLOCKQUOTE: ...>>>'
# into the tree as plain text. When the tree is serialized, BeautifulSoup's
# default formatter turns '<' and '>' inside text into '&lt;' and '&gt;', so
# that escaped form is what convert() has to look for.
_markerClose = '&gt;&gt;&gt;'
_blockquoteOpen = '&lt;&lt;&lt;BLOCKQUOTE:'
# Non-greedy, so that two links on the same line are not merged into one.
_floatingLinkRegex = re.compile(r'&lt;&lt;&lt;FLOATING LINK: (.+?)&gt;&gt;&gt;')
_blockquoteRegex = re.compile(r'(&lt;&lt;&lt;BLOCKQUOTE: .*?&gt;&gt;&gt;)', re.DOTALL)


def _plainStrings(tag):
    """return a list of the direct children of *tag* that are plain text nodes"""
    # The exact type is compared on purpose: Comment, CData and friends are
    # subclasses of NavigableString and must not be rewritten as text.
    # A list (snapshot) is returned so callers may replace the nodes safely.
    return [c for c in tag.contents if type(c) is bs4.element.NavigableString]


def _escapeCharacters(tag):
    """non-recursively backslash-escape markdown characters in the tag

    The escaped characters are those in _escapeCharSequence: backslash,
    backtick, asterisk, underscore, square brackets and the hash sign.
    """
    for c in _plainStrings(tag):
        c.replace_with(_escapeCharSub(r'\\\1', c))


def _breakRemNewlines(tag):
    """non-recursively break spaces and remove newlines in the tag

    Runs of two or more spaces become a single space; newlines are dropped.
    """
    for c in _plainStrings(tag):
        c.replace_with(re.sub(r' {2,}', ' ', c).replace('\n', ''))


def _markdownify(tag, _listType=None, _blockQuote=False, _listIndex=1):
    """recursively converts a tag into markdown

    The tree is rewritten in place: supported tags are unwrapped and
    surrounded by the matching markdown text, everything else stays HTML.

    tag         -- the bs4 tag (or BeautifulSoup document) to process
    _listType   -- 'ul' or 'ol' when *tag* is an <li> inside such a list
    _blockQuote -- True when *tag* is a direct child of a <blockquote>
    _listIndex  -- 1-based position of an <li> inside its list
    """
    # Collected up front: the handlers below unwrap *tag*, after which its
    # children could no longer be reached through it.
    children = tag.find_all(recursive=False)

    if tag.name == '[document]':
        for child in children:
            _markdownify(child)
        return

    if tag.name not in _supportedTags or not _supportedAttrs(tag):
        if tag.name not in _inlineTags:
            # unsupported block tag: keep it verbatim, set apart by blank lines
            tag.insert_before('\n\n')
            tag.insert_after('\n\n')
        else:
            # unsupported inline tag: keep the tag itself, convert its inside
            _escapeCharacters(tag)
            for child in children:
                _markdownify(child)
        return

    if tag.name not in ('pre', 'code'):
        # code must stay literal; everything else is escaped and re-flowed
        _escapeCharacters(tag)
        _breakRemNewlines(tag)

    if tag.name == 'p':
        if tag.string is not None and tag.string.strip() == u'':
            # keep a whitespace-only paragraph visible
            tag.string = u'\xa0'
            tag.unwrap()
            return
        separator = '\n' if _blockQuote else '\n\n'
        tag.insert_before(separator)
        tag.insert_after(separator)
        tag.unwrap()
        for child in children:
            _markdownify(child)
    elif tag.name == 'br':
        # a markdown hard line break needs at least two trailing spaces
        tag.string = '  \n'
        tag.unwrap()
    elif tag.name == 'img':
        alt = ''
        title = ''
        if tag.has_attr('alt'):
            alt = tag['alt']
        if tag.has_attr('title') and tag['title']:
            title = ' "%s"' % tag['title']
        # .get(): an <img> without src must not raise KeyError
        tag.string = '![%s](%s%s)' % (alt, tag.get('src', ''), title)
        tag.unwrap()
    elif tag.name == 'hr':
        tag.string = '\n---\n'
        tag.unwrap()
    elif tag.name == 'pre':
        tag.insert_before('\n\n')
        tag.insert_after('\n\n')
        if not tag.code:
            # <pre> without <code> is left as HTML
            return
        if not _supportedAttrs(tag.code):
            return
        for child in tag.code.find_all(recursive=False):
            if child.name != 'br':
                # markup other than <br> inside the code: leave as HTML
                return
        # code block
        for br in tag.code.find_all('br'):
            br.string = '\n'
            br.unwrap()
        tag.code.unwrap()
        lines = unicode(tag).strip().split('\n')
        lines[0] = lines[0][5:]  # drop the leading '<pre>'
        lines[-1] = lines[-1][:-6]  # drop the trailing '</pre>'
        if not lines[-1]:
            lines.pop()
        # An indented markdown code block needs four leading spaces.
        # Non-breaking spaces become ordinary ones: inside a code block an
        # entity would be shown literally.
        lines = ['    %s' % line.replace(u'\xa0', ' ') for line in lines]
        tag.replace_with(BeautifulSoup('\n'.join(lines), 'html.parser'))
        return
    elif tag.name == 'code':
        # inline code
        if children:
            return
        tag.insert_before('`` ')
        tag.insert_after(' ``')
        tag.unwrap()
    elif _recursivelyValid(tag):
        if tag.name == 'blockquote':
            # ! FIXME: hack -- convert() turns this marker into '> ' prefixes
            tag.insert_before('<<<BLOCKQUOTE: ')
            tag.insert_after('>>>')
            tag.unwrap()
            for child in children:
                _markdownify(child, _blockQuote=True)
            return
        elif tag.name == 'a':
            # process children first
            for child in children:
                _markdownify(child)
            if not tag.has_attr('href'):
                return
            if tag.string != tag.get('href') or tag.has_attr('title'):
                title = ''
                if tag.has_attr('title'):
                    title = ' "%s"' % tag['title']
                # Re-parsing the serialized tag merges the adjacent text nodes
                # produced by the children into a single string.
                # NOTE(review): if an inline tag is still present inside the
                # link, .string looks like it will be None and the label will
                # read 'None' -- confirm and handle separately.
                label = BeautifulSoup(unicode(tag), 'html.parser').string
                tag.string = '[%s](%s%s)' % (label, tag.get('href', ''), title)
            else:
                # ! FIXME: hack -- convert() turns this marker into '<url>'
                tag.string = '<<<FLOATING LINK: %s>>>' % tag.string
            tag.unwrap()
            return
        elif tag.name in _headingTags:
            level = int(tag.name[1])
            tag.insert_before('\n\n%s ' % ('#' * level))
            tag.insert_after('\n\n')
            tag.unwrap()
        elif tag.name in ('ul', 'ol'):
            tag.insert_before('\n\n')
            tag.insert_after('\n\n')
            tag.unwrap()
            for i, child in enumerate(children):
                _markdownify(child, _listType=tag.name, _listIndex=i + 1)
            return
        elif tag.name == 'li':
            if not _listType:
                # <li> outside of list; ignore
                return
            if _listType == 'ul':
                tag.insert_before('* ')
            else:
                tag.insert_before('%d. ' % _listIndex)
            for child in children:
                _markdownify(child)
            # Continuation lines are indented by four spaces so that they
            # remain part of the list item.
            for c in _plainStrings(tag):
                c.replace_with('\n    '.join(c.split('\n')))
            tag.insert_after('\n')
            tag.unwrap()
            return
        elif tag.name in ('strong', 'b'):
            tag.insert_before('__')
            tag.insert_after('__')
            tag.unwrap()
        elif tag.name in ('em', 'i'):
            tag.insert_before('_')
            tag.insert_after('_')
            tag.unwrap()
        # headings and emphasis fall through to here
        for child in children:
            _markdownify(child)


def convert(html):
    """converts an html string to markdown while preserving unsupported markup.

    html -- the HTML source as a string

    Returns the markdown text with leading and trailing newlines removed.
    Tags that cannot be expressed in markdown are kept as HTML.
    """
    bs = BeautifulSoup(html, 'html.parser')
    _markdownify(bs)
    # Written as an entity so that a non-breaking space (including the
    # placeholder of an otherwise empty paragraph) survives in the output.
    ret = unicode(bs).replace(u'\xa0', '&nbsp;')
    ret = re.sub(r'\n{3,}', r'\n\n', ret)
    # ! FIXME: hack
    ret = _floatingLinkRegex.sub(r'<\1>', ret)
    # ! FIXME: hack
    parts = _blockquoteRegex.split(ret)
    for i, part in enumerate(parts):
        if part.startswith(_blockquoteOpen):
            quoted = '> ' + part[len(_blockquoteOpen):-len(_markerClose)]
            parts[i] = quoted.replace('\n', '\n> ')
    return ''.join(parts).strip('\n')