diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..6b81bf8 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -7,11 +7,12 @@ but also have api methods that let user have more control e.g. so they can nest calls to something like 'convert_chunk' in loops -user can pass existing document object as arg +user can pass existing document object as arg (if they want to manage rest of document themselves) How to deal with block level style applied over table elements? e.g. text align """ +import copy import re, argparse import io, os import urllib.request @@ -30,7 +31,7 @@ # values in inches INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. DEFAULT_TABLE_STYLE = None @@ -42,17 +43,19 @@ def get_filename_from_url(url): return os.path.basename(urlparse(url).path) + def is_url(url): """ - Not to be used for actually validating a url, but in our use case we only + Not to be used for actually validating a url, but in our use case we only care if it's a url or a file path, and they're pretty distinguishable """ parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def fetch_image(url): """ - Attempts to fetch an image from a url. + Attempts to fetch an image from a url. If successful returns a bytes object, else returns None :return: @@ -64,9 +67,11 @@ def fetch_image(url): except urllib.error.URLError: return None + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -132,12 +137,14 @@ def remove_whitespace(string, leading=False, trailing=False): # TODO need some way to get rid of extra spaces in e.g. text text return re.sub(r'\s+', ' ', string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 p = paragraph._element p.getparent().remove(p) p._p = p._element = None + font_styles = { 'b': 'bold', 'strong': 'bold', @@ -160,6 +167,7 @@ def delete_paragraph(paragraph): 'LIST_NUMBER': 'List Number', } + class HtmlToDocx(HTMLParser): def __init__(self): @@ -188,9 +196,9 @@ def set_initial_attrs(self, document=None): self.doc = document else: self.doc = Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc - self.include_tables = True #TODO add this option back in? + self.include_tables = True # TODO add this option back in? self.include_images = self.options['images'] self.include_styles = self.options['styles'] self.paragraph = None @@ -233,25 +241,25 @@ def add_styles_to_run(self, style): colors = [int(x) for x in color.split(',')] elif '#' in style['color']: color = style['color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing self.run.font.color.rgb = RGBColor(*colors) - + if 'background-color' in style: if 'rgb' in style['background-color']: color = color = re.sub(r'[a-z()]+', '', style['background-color']) colors = [int(x) for x in color.split(',')] elif '#' in style['background-color']: color = style['background-color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing - self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + self.run.font.highlight_color = WD_COLOR.GRAY_25 # TODO: map colors def apply_paragraph_style(self, style=None): try: @@ -273,14 +281,14 @@ def handle_li(self): if list_depth: list_type = self.tags['list'][-1] else: - list_type = 'ul' # assign unordered if no tag + list_type = 'ul' # assign unordered if no tag if list_type == 'ol': list_style = styles['LIST_NUMBER'] else: list_style = styles['LIST_BULLET'] - self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph = self.doc.add_paragraph(style=list_style) self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) self.paragraph.paragraph_format.line_spacing = 1 @@ -350,12 +358,30 @@ def handle_table(self): if col.name == 'th': cell_html = "%s" % cell_html docx_cell = self.table.cell(cell_row, cell_col) + if col.has_attr('rowspan'): + setattr(docx_cell, 'rowspan', col['rowspan']) + if col.has_attr('colspan'): + setattr(docx_cell, 'colspan', col['colspan']) child_parser = HtmlToDocx() child_parser.copy_settings_from(self) child_parser.add_html_to_cell(cell_html, docx_cell) cell_col += 1 cell_row += 1 - + for i in range(len(rows)): + cols = self.get_table_columns(rows[i]) + for j in range(len(cols)): + if cols[j].has_attr('rowspan'): + rowspan = int(cols[j]['rowspan']) + if rowspan <= 1: + continue + merged_cell = self.table.rows[i].cells[j] + for k in range(1, rowspan): + merged_cell = merged_cell.merge(self.table.rows[i+k].cells[j]) + if cols[j].has_attr('colspan'): + colspan = int(cols[j]['colspan']) + merged_cell = self.table.rows[i].cells[j] + for k in range(1, colspan): + merged_cell = merged_cell.merge(self.table.rows[i].cells[j+k]) # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -375,7 +401,6 @@ def handle_link(self, href, text): hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) - # Create sub-run subrun = self.paragraph.add_run() rPr = docx.oxml.shared.OxmlElement('w:rPr') @@ -417,7 +442,7 @@ def handle_starttag(self, tag, attrs): return elif tag == 'ol' or tag == 'ul': self.tags['list'].append(tag) - return # don't apply styles for now + return # don't apply styles for now elif tag == 'br': self.run.add_break() return @@ -439,14 +464,14 @@ def handle_starttag(self, tag, attrs): pPr = self.paragraph._p.get_or_add_pPr() pBdr = OxmlElement('w:pBdr') pPr.insert_element_before(pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', - 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', - 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', - 'w:pPrChange' - ) + 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', + 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', + 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', + 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', + 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', + 'w:pPrChange' + ) bottom = OxmlElement('w:bottom') bottom.set(qn('w:val'), 'single') bottom.set(qn('w:sz'), '6') @@ -568,7 +593,35 @@ def ignore_nested_tables(self, tables_soup): def get_table_rows(self, table_soup): # If there's a header, body, footer or direct child tr tags, add row dimensions from there - return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + + if len(table_soup.findChildren('tbody', recursive=False)) > 0: + rows = table_soup.select('table > tbody > tr', recursive=False) + else: + rows = table_soup.select('table > tr', recursive=False) + results = [[data for data in row.find_all('td', recursive=False)] for row in rows] + rowspan = [] + for no, tr in enumerate(rows): + tmp = [] + for td_no, data in enumerate(tr.findChildren('td', recursive=False)): + if data.has_attr("rowspan"): + rowspan.append((no, td_no, int(data["rowspan"]), data)) + if data.has_attr('colspan'): + for k in range(1, int(data['colspan'])): + results[no].insert(td_no+k, BeautifulSoup('