" % get_filename_from_url(src))
- # add styles?
+ '''
+ Adding style: align the image according to its inline `style` attribute.
+ For right-alignment: `'float: right;'`
+ For center-alignment: `'display: block; margin-left: auto; margin-right: auto;'`
+ Everything else is left-aligned.
+ '''
+ if 'style' in current_attrs:
+ style = current_attrs['style']
+ image_alignment = get_image_alignment(style)
+ last_paragraph = self.doc.paragraphs[-1]
+ if image_alignment == ImageAlignment.RIGHT:
+ last_paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT
+ if image_alignment == ImageAlignment.CENTER:
+ last_paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER
def handle_table(self):
"""
@@ -355,7 +396,7 @@ def handle_table(self):
child_parser.add_html_to_cell(cell_html, docx_cell)
cell_col += 1
cell_row += 1
-
+
# skip all tags until corresponding closing tag
self.instances_to_skip = len(table_soup.find_all('table'))
self.skip_tag = 'table'
@@ -375,7 +416,6 @@ def handle_link(self, href, text):
hyperlink = docx.oxml.shared.OxmlElement('w:hyperlink')
hyperlink.set(docx.oxml.shared.qn('r:id'), rel_id)
-
# Create sub-run
subrun = self.paragraph.add_run()
rPr = docx.oxml.shared.OxmlElement('w:rPr')
@@ -417,7 +457,7 @@ def handle_starttag(self, tag, attrs):
return
elif tag == 'ol' or tag == 'ul':
self.tags['list'].append(tag)
- return # don't apply styles for now
+ return # don't apply styles for now
elif tag == 'br':
self.run.add_break()
return
@@ -439,14 +479,14 @@ def handle_starttag(self, tag, attrs):
pPr = self.paragraph._p.get_or_add_pPr()
pBdr = OxmlElement('w:pBdr')
pPr.insert_element_before(pBdr,
- 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
- 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
- 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
- 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
- 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
- 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
- 'w:pPrChange'
- )
+ 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap',
+ 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN',
+ 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind',
+ 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc',
+ 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap',
+ 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr',
+ 'w:pPrChange'
+ )
bottom = OxmlElement('w:bottom')
bottom.set(qn('w:val'), 'single')
bottom.set(qn('w:sz'), '6')
@@ -588,7 +628,7 @@ def get_tables(self):
self.include_tables = False
return
# find other way to do it, or require this dependency?
- self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
+ self.tables = self.ignore_nested_tables(self.soup.find_all('table'))
self.table_no = 0
def run_process(self, html):
@@ -618,7 +658,7 @@ def add_html_to_cell(self, html, cell):
# cells must end with a paragraph or will get message about corrupt file
# https://stackoverflow.com/a/29287121
if not self.doc.paragraphs:
- self.doc.add_paragraph('')
+ self.doc.add_paragraph('')
def parse_html_file(self, filename_html, filename_docx=None):
with open(filename_html, 'r') as infile:
@@ -629,24 +669,24 @@ def parse_html_file(self, filename_html, filename_docx=None):
path, filename = os.path.split(filename_html)
filename_docx = '%s/new_docx_file_%s' % (path, filename)
self.doc.save('%s.docx' % filename_docx)
-
+
def parse_html_string(self, html):
self.set_initial_attrs()
self.run_process(html)
return self.doc
-if __name__=='__main__':
-
+
+if __name__ == '__main__':
arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting')
arg_parser.add_argument('filename_html', help='The .html file to be parsed')
arg_parser.add_argument(
- 'filename_docx',
- nargs='?',
- help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
+ 'filename_docx',
+ nargs='?',
+ help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]',
default=None
)
- arg_parser.add_argument('--bs', action='store_true',
- help='Attempt to fix html before parsing. Requires bs4. Default True')
+ arg_parser.add_argument('--bs', action='store_true',
+ help='Attempt to fix html before parsing. Requires bs4. Default True')
args = vars(arg_parser.parse_args())
file_html = args.pop('filename_html')
diff --git a/tests/text1.html b/tests/text1.html
index 71b8e2a..6c79427 100644
--- a/tests/text1.html
+++ b/tests/text1.html
@@ -14,7 +14,11 @@
2 + 3 = 5↵this is code
A picture from file: 
A picture from url: 
+A centered picture from url: 
+A right aligned picture from url: 
A picture from url that's broken: 
+A picture with height and width: 
+A picture with height and width and centered: 
heading 1
- Ordered list first item