diff --git a/htmldocx/h2d.py b/htmldocx/h2d.py index 6a0e113..3a6ff03 100644 --- a/htmldocx/h2d.py +++ b/htmldocx/h2d.py @@ -12,25 +12,29 @@ How to deal with block level style applied over table elements? e.g. text align """ -import re, argparse -import io, os +import argparse +import enum +import io +import os +import re import urllib.request -from urllib.parse import urlparse from html.parser import HTMLParser +from urllib.parse import urlparse -import docx, docx.table +import docx +import docx.table +from bs4 import BeautifulSoup from docx import Document -from docx.shared import RGBColor, Pt, Inches from docx.enum.text import WD_COLOR, WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement from docx.oxml.ns import qn - -from bs4 import BeautifulSoup +from docx.shared import Pt +from docx.shared import RGBColor, Inches # values in inches INDENT = 0.25 LIST_INDENT = 0.5 -MAX_INDENT = 5.5 # To stop indents going off the page +MAX_INDENT = 5.5 # To stop indents going off the page # Style to use with tables. By default no style is used. DEFAULT_TABLE_STYLE = None @@ -42,6 +46,7 @@ def get_filename_from_url(url): return os.path.basename(urlparse(url).path) + def is_url(url): """ Not to be used for actually validating a url, but in our use case we only @@ -50,6 +55,7 @@ def is_url(url): parts = urlparse(url) return all([parts.scheme, parts.netloc, parts.path]) + def fetch_image(url): """ Attempts to fetch an image from a url. @@ -64,9 +70,11 @@ def fetch_image(url): except urllib.error.URLError: return None + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) + def remove_whitespace(string, leading=False, trailing=False): """Remove white space from a string. @@ -132,12 +140,28 @@ def remove_whitespace(string, leading=False, trailing=False): # TODO need some way to get rid of extra spaces in e.g. text text return re.sub(r'\s+', ' ', string) + def delete_paragraph(paragraph): # https://github.com/python-openxml/python-docx/issues/33#issuecomment-77661907 p = paragraph._element p.getparent().remove(p) p._p = p._element = None + +class ImageAlignment(enum.Enum): + LEFT = 1 + CENTER = 2 + RIGHT = 3 + + +def get_image_alignment(image_style): + if image_style == 'float: right;': + return ImageAlignment.RIGHT + if image_style == 'display: block; margin-left: auto; margin-right: auto;': + return ImageAlignment.CENTER + return ImageAlignment.LEFT + + font_styles = { 'b': 'bold', 'strong': 'bold', @@ -160,6 +184,7 @@ def delete_paragraph(paragraph): 'LIST_NUMBER': 'List Number', } + class HtmlToDocx(HTMLParser): def __init__(self): @@ -188,9 +213,9 @@ def set_initial_attrs(self, document=None): self.doc = document else: self.doc = Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup self.document = self.doc - self.include_tables = True #TODO add this option back in? + self.include_tables = True # TODO add this option back in? self.include_images = self.options['images'] self.include_styles = self.options['styles'] self.paragraph = None @@ -233,25 +258,25 @@ def add_styles_to_run(self, style): colors = [int(x) for x in color.split(',')] elif '#' in style['color']: color = style['color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing self.run.font.color.rgb = RGBColor(*colors) - + if 'background-color' in style: if 'rgb' in style['background-color']: color = color = re.sub(r'[a-z()]+', '', style['background-color']) colors = [int(x) for x in color.split(',')] elif '#' in style['background-color']: color = style['background-color'].lstrip('#') - colors = tuple(int(color[i:i+2], 16) for i in (0, 2, 4)) + colors = tuple(int(color[i:i + 2], 16) for i in (0, 2, 4)) else: colors = [0, 0, 0] # TODO map colors to named colors (and extended colors...) # For now set color to black to prevent crashing - self.run.font.highlight_color = WD_COLOR.GRAY_25 #TODO: map colors + self.run.font.highlight_color = WD_COLOR.GRAY_25 # TODO: map colors def apply_paragraph_style(self, style=None): try: @@ -273,22 +298,22 @@ def handle_li(self): if list_depth: list_type = self.tags['list'][-1] else: - list_type = 'ul' # assign unordered if no tag + list_type = 'ul' # assign unordered if no tag if list_type == 'ol': list_style = styles['LIST_NUMBER'] else: list_style = styles['LIST_BULLET'] - self.paragraph = self.doc.add_paragraph(style=list_style) + self.paragraph = self.doc.add_paragraph(style=list_style) self.paragraph.paragraph_format.left_indent = Inches(min(list_depth * LIST_INDENT, MAX_INDENT)) self.paragraph.paragraph_format.line_spacing = 1 - def add_image_to_cell(self, cell, image): + def add_image_to_cell(self, cell, image, width, height): # python-docx doesn't have method yet for adding images to table cells. For now we use this paragraph = cell.add_paragraph() run = paragraph.add_run() - run.add_picture(image) + run.add_picture(image, width, height) def handle_img(self, current_attrs): if not self.include_images: @@ -296,6 +321,9 @@ def handle_img(self, current_attrs): self.skip_tag = 'img' return src = current_attrs['src'] + # added image dimension, interpreting values as pixel only + height = Pt(int(current_attrs['height'][:-2])) if 'height' in current_attrs else None + width = Pt(int(current_attrs['width'][:-2])) if 'width' in current_attrs else None # fetch image src_is_url = is_url(src) if src_is_url: @@ -309,9 +337,9 @@ def handle_img(self, current_attrs): if image: try: if isinstance(self.doc, docx.document.Document): - self.doc.add_picture(image) + self.doc.add_picture(image, width, height) else: - self.add_image_to_cell(self.doc, image) + self.add_image_to_cell(self.doc, image, width, height) except FileNotFoundError: image = None if not image: @@ -320,7 +348,20 @@ def handle_img(self, current_attrs): else: # avoid exposing filepaths in document self.doc.add_paragraph("" % get_filename_from_url(src)) - # add styles? + ''' + #adding style + For right-alignment: `'float: right;'` + For center-alignment: `'display: block; margin-left: auto; margin-right: auto;'` + Everything else would be Left aligned + ''' + if 'style' in current_attrs: + style = current_attrs['style'] + image_alignment = get_image_alignment(style) + last_paragraph = self.doc.paragraphs[-1] + if image_alignment == ImageAlignment.RIGHT: + last_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT + if image_alignment == ImageAlignment.CENTER: + last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER def handle_table(self): """ @@ -355,7 +396,7 @@ def handle_table(self): child_parser.add_html_to_cell(cell_html, docx_cell) cell_col += 1 cell_row += 1 - + # skip all tags until corresponding closing tag self.instances_to_skip = len(table_soup.find_all('table')) self.skip_tag = 'table' @@ -375,7 +416,6 @@ def handle_link(self, href, text): hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink') hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id) - # Create sub-run subrun = self.paragraph.add_run() rPr = docx.oxml.shared.OxmlElement('w:rPr') @@ -417,7 +457,7 @@ def handle_starttag(self, tag, attrs): return elif tag == 'ol' or tag == 'ul': self.tags['list'].append(tag) - return # don't apply styles for now + return # don't apply styles for now elif tag == 'br': self.run.add_break() return @@ -439,14 +479,14 @@ def handle_starttag(self, tag, attrs): pPr = self.paragraph._p.get_or_add_pPr() pBdr = OxmlElement('w:pBdr') pPr.insert_element_before(pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', - 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', - 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', - 'w:pPrChange' - ) + 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', + 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', + 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', + 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', + 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', + 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', + 'w:pPrChange' + ) bottom = OxmlElement('w:bottom') bottom.set(qn('w:val'), 'single') bottom.set(qn('w:sz'), '6') @@ -588,7 +628,7 @@ def get_tables(self): self.include_tables = False return # find other way to do it, or require this dependency? - self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.tables = self.ignore_nested_tables(self.soup.find_all('table')) self.table_no = 0 def run_process(self, html): @@ -618,7 +658,7 @@ def add_html_to_cell(self, html, cell): # cells must end with a paragraph or will get message about corrupt file # https://stackoverflow.com/a/29287121 if not self.doc.paragraphs: - self.doc.add_paragraph('') + self.doc.add_paragraph('') def parse_html_file(self, filename_html, filename_docx=None): with open(filename_html, 'r') as infile: @@ -629,24 +669,24 @@ def parse_html_file(self, filename_html, filename_docx=None): path, filename = os.path.split(filename_html) filename_docx = '%s/new_docx_file_%s' % (path, filename) self.doc.save('%s.docx' % filename_docx) - + def parse_html_string(self, html): self.set_initial_attrs() self.run_process(html) return self.doc -if __name__=='__main__': - + +if __name__ == '__main__': arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting') arg_parser.add_argument('filename_html', help='The .html file to be parsed') arg_parser.add_argument( - 'filename_docx', - nargs='?', - help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', + 'filename_docx', + nargs='?', + help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', default=None ) - arg_parser.add_argument('--bs', action='store_true', - help='Attempt to fix html before parsing. Requires bs4. Default True') + arg_parser.add_argument('--bs', action='store_true', + help='Attempt to fix html before parsing. Requires bs4. Default True') args = vars(arg_parser.parse_args()) file_html = args.pop('filename_html') diff --git a/tests/text1.html b/tests/text1.html index 71b8e2a..6c79427 100644 --- a/tests/text1.html +++ b/tests/text1.html @@ -14,7 +14,11 @@
2 + 3 = 5↵this is code

A picture from file:

A picture from url:

+

A centered picture from url:

+

A right aligned picture from url:

A picture from url that's broken:

+

A picture with height and width:

+

A picture with height and width and centered:

heading 1

  1. Ordered list first item