Link to home
Start Free TrialLog in
Avatar of Murray Brown
Murray BrownFlag for United Kingdom of Great Britain and Northern Ireland

asked on

Word VBA Convert formatted text to markup

Hi

In Word VBA is it possible to convert formatted text into markup language like html?

Thanks
Avatar of aikimark
aikimark
Flag of United States of America image

I'm going to say "yes", but it is contingent on what your markup looks like.  Please post a representative example.
ASKER CERTIFIED SOLUTION
Avatar of Daniel Pineault
Daniel Pineault

Link to home
membership
This solution is only available to members.
To access this solution, you must be a member of Experts Exchange.
Start Free Trial
Avatar of Murray Brown

ASKER

Thanks. That makes sense
If it's helpful, I use the VBA code below to convert Word to simple HTML. The code handles hyperlinks, bold, italic and underline formatting, standard Word heading styles (Heading1, Heading2 etc) plus bulleted and simple numbered lists.

It does not handle tables or images, though it could be extended to do so fairly easily. There are four routines and a function (which creates a closing HTML tag).

The main routine calls the others.

In essence it does the following:
  • Works with your current word selection
  • If nothing is selected it converts the whole document
  • Converts Word hyperlinks to HTML setting the html target to "_blank"
  • Runs through all paragraphs in the selection and
  • - Converts lists to their HTML equivalent
  • - converts ordinary paragraphs to <p> tagged HTML
  • - converts Heading1 to <H1> etc
  • Calls another routine to search for and convert bold, italic and underline Word formatting to HTML tags

Hope it is useful

Sub Word2HTML()
'''''''''''''''''''''''''''''''''''''''''''
' Converts Word selection to HTML         '
' Converts whole document if no selection '
' Neil Fleming, 2018                      '
'''''''''''''''''''''''''''''''''''''''''''
Dim ss As Selection, rr As Range, wPara As Paragraph
Dim sTag As String, sList As String, sHead As String
Dim inList As Boolean, iPos As Long

Set ss = Selection
'if nothing selected, select whole document
If ss.End = ss.start Then ActiveDocument.Range.Select

'work with range rather than selection
Set rr = ss.Range
'call ConvertHyperlinks to convert hyperlinks as target=_blank
ConvertHyperlinks2HTML rr

'iterate paragraphs
For Each wPara In rr.Paragraphs
    'check for bulleted or simple numbered lists and set HTML tags
    Select Case wPara.Range.ListFormat.ListType
        Case WdListType.wdListBullet: sList = "<ul>": sTag = "<li>"
        Case WdListType.wdListSimpleNumbering: sList = "<ol>": sTag = "<li>"
        Case WdListType.wdListNoNumbering: sTag = "<p>"
    End Select
    
    'check for standard Word heading styles
    sHead = wPara.Style
    If Left(sHead, 7) = "Heading" Then sTag = "<h" & Right(sHead, 1) & ">"
    
    
    Select Case sTag
        Case "<li>"
            'remove Word list format
            wPara.Range.ListFormat.RemoveNumbers
            'on first occurrence of list tag add the <ul> or <ol> opener
            If Not inList Then
            inList = True
            wPara.Range.InsertBefore sList & Chr(13) & sTag
            Else
            wPara.Range.InsertBefore sTag
            End If
            'call AddEndTag routine, ensures tag is not accidentally formatted bold,italic, underline
            AddEndTag wPara.Range, sTag
            
        Case "<p>", "<h1>", "<h2>", "<h3>", "<h4>"
            'at end of previous list, if one exists, insert closing tag for HTML list
            If inList Then
            inList = False
            wPara.Range.InsertBefore eTag(sList) & Chr(13) & sTag
            Else
            wPara.Range.InsertBefore sTag
            End If
            AddEndTag wPara.Range, sTag
    End Select
    
'check for blank paragraphs - adjacent html open and close tags - and add non-breaking space
iPos = InStr(wPara.Range, sTag & eTag(sTag))
If iPos > 0 Then
wPara.Range.Characters(iPos + (Len(sTag) - 1)).InsertAfter "&nbsp;"
End If
Next

'convert Word bold, italic, underline to html
FormatIBU2HTML rr

End Sub

Sub AddEndTag(rText As Range, sTag As String)
Dim iLen As Long, rFormat As Range
iLen = Len(rText.Text)
rText.Characters(Len(rText.Text) - 1).InsertAfter eTag(sTag)
'ensure html tag is not bold underline or italic
Set rFormat = rText
rFormat.start = rText.start + (iLen - 1)
rFormat.Font.Bold = False: rFormat.Font.Italic = False: rFormat.Font.Underline = False
End Sub

Function eTag(s As String) As String
'returns closing HTML tag for any given opening tag
Dim iPos As Long
iPos = InStr(s, " ")
If iPos = 0 Then
eTag = "</" & Right(s, Len(s) - 1)
Else
eTag = "</" & Mid(s, 2, iPos - 2) & ">"
End If
End Function

Sub FormatIBU2HTML(rr As Range)
'formats italic, bold, underline as HTML
Dim rFind As Range
Dim iStart As Long, iStop As Long, iFound As Long
Dim notFound As Boolean
Const cTags = "<i>,<b>,<u>"
Dim aTag() As String
Dim iTag As Long

'create array of tags
aTag = Split(cTags, ",")
iStart = rr.start
iStop = rr.End
Set rFind = rr
'set up rFind find parameters
With rFind.Find
    .ClearFormatting
    .Forward = True
    .Format = False
    .MatchCase = False
    .MatchWholeWord = False
    .MatchSoundsLike = False
    .MatchAllWordForms = False
    .MatchWildcards = True
    .Text = "*"
    .Wrap = wdFindStop

'find formatting for each tag

For iTag = 0 To 2
    notFound = False
    Do
    rFind.start = iStart
    rFind.End = iStop
   'Set format for search
   .ClearFormatting
    Select Case iTag
    Case 0: .Font.Italic = True
    Case 1: .Font.Bold = True
    Case 2: .Font.Underline = True
    End Select
       
 'search
 If .Execute Then
    iFound = rFind.start
   'Set format to find end of formatting
    Select Case iTag
    Case 0: .Font.Italic = False
    Case 1: .Font.Bold = False
    Case 2: .Font.Underline = False
    End Select
   
   'find end of formatting
   rFind.End = rr.End
   If .Execute Then
   rFind.End = rFind.start
   rFind.start = iFound
   Else
   notFound = True
   End If
   'remove Word formatting
   Select Case iTag
    Case 0: rFind.Font.Italic = False
    Case 1: rFind.Font.Bold = False
    Case 2: rFind.Font.Underline = False
    End Select
   'add HTML formatting tags
   rFind.InsertBefore aTag(iTag)
   rFind.InsertAfter eTag(aTag(iTag))
 Else
 notFound = True
 End If
 Loop Until notFound
Next
End With
rFind.start = iStart
rFind.End = iStop
End Sub
Sub ConvertHyperlinks2HTML(rr As Range)
Dim hh As Hyperlink

For Each hh In rr.Hyperlinks
    'only convert text hyperlinks
    If hh.Type = msoHyperlinkRange Then
    hh.Range.Font.Underline = False
    hh.Range.InsertAfter "</a>"
    hh.Range.InsertBefore "<a target='_blank' href='" & hh.Address & "'>"
    End If

Next
End Sub

Open in new window