Thursday, September 17, 2009

Visual Studio Macros to clean up HTML tables from Microsoft Word

The following Visual Studio macros can be used to strip an HTML table copied from Word down to its core tags.
It's a little (very) rough around the edges, but it seems to get the job done.

Sub CleanWordTable()
        ' Strips Word-generated markup from the HTML table in the current
        ' selection, leaving only the bare table tags.
        '
        ' Steps, in order:
        '   1. Join style="..." attributes that wrap across lines (FlattenStyles).
        '   2. Remove empty <o:p></o:p> elements and class="MsoNormal".
        '   3. Remove <span ...> and </span> tags.
        '   4. Remove style="..." attributes (single-line, then two-line).
        '   5. Collapse every <td ...> to a plain <td> and drop align="right".
        '   6. Remove <p>, </p>, <b> and </b> tags.
        ' The selection is re-formatted after steps 4, 5 and 6.
        '
        ' A step that finds nothing to replace is simply skipped; it does not
        ' abort the macro, so tables without <p> or <b> tags are still cleaned
        ' and formatted to the end.

        Const Literal As vsFindPatternSyntax = vsFindPatternSyntax.vsFindPatternSyntaxLiteral
        Const RegExpr As vsFindPatternSyntax = vsFindPatternSyntax.vsFindPatternSyntaxRegExpr

        FlattenStyles()

        ' NOTE(review): the Edit.Replace commands come from the original recorded
        ' macro; presumably they only bring up the Replace UI - confirm before
        ' removing them.
        DTE.ExecuteCommand("Edit.Replace")
        ReplaceAllInSelection("<o:p></o:p>", "", Literal)

        DTE.ExecuteCommand("Edit.Replace")
        ReplaceAllInSelection("class=""MsoNormal""", "", Literal)

        ' Opening span tags with any attributes, then the closing tags.
        ReplaceAllInSelection("\<span.@\>", "", RegExpr)
        ReplaceAllInSelection("\</span\>", "", RegExpr)

        ' Inline styles that sit on one line, then those still split over two.
        ReplaceAllInSelection("style="".@""", "", RegExpr)
        ReplaceAllInSelection("style="".@(\n.@)""", "", RegExpr)

        DTE.ExecuteCommand("Edit.FormatSelection")

        ' Drop every remaining attribute from the cell tags.
        ReplaceAllInSelection("\<td.@\>", "<td>", RegExpr)
        ReplaceAllInSelection("align=""right""", "", RegExpr)

        DTE.ExecuteCommand("Edit.FormatSelection")

        ' Paragraph and bold wrappers; any of these may legitimately be absent.
        ReplaceAllInSelection("<p>", "", Literal)
        ReplaceAllInSelection("</p>", "", Literal)
        ReplaceAllInSelection("</b>", "", Literal)
        ReplaceAllInSelection("<b>", "", Literal)

        DTE.ExecuteCommand("Edit.FormatSelection")
    End Sub

    ' Runs one case-sensitive Replace All over the current document selection.
    '   findWhat    - text or pattern to search for
    '   replaceWith - replacement text
    '   syntax      - whether findWhat is a literal or a regular expression
    ' The result of the search is ignored, so "not found" is not an error.
    Private Sub ReplaceAllInSelection(ByVal findWhat As String, ByVal replaceWith As String, ByVal syntax As vsFindPatternSyntax)
        With DTE.Find
            .FindWhat = findWhat
            .ReplaceWith = replaceWith
            .Target = vsFindTarget.vsFindTargetCurrentDocumentSelection
            .MatchCase = True
            .MatchWholeWord = False
            .MatchInHiddenText = True
            .PatternSyntax = syntax
            .ResultsLocation = vsFindResultsLocation.vsFindResultsNone
            .Action = vsFindAction.vsFindActionReplaceAll
            .Execute()
        End With
    End Sub

    Sub FlattenStyles()
        ' Pulls style="..." attributes that span several lines back onto one line.
        '
        ' The pattern tags an opening style=" plus the non-quote characters up to
        ' a line break and replaces the match with the tagged text alone, i.e. it
        ' deletes that line break. Replace All is repeated until a pass finds
        ' nothing, so values wrapped over any number of lines end up joined.
        '
        ' Note that the search runs over the whole current document, not just
        ' the selection.
        With DTE.Find
            .PatternSyntax = vsFindPatternSyntax.vsFindPatternSyntaxRegExpr
            .FindWhat = "{style=""[^""]@}\n"
            .ReplaceWith = "\1"
            .Action = vsFindAction.vsFindActionReplaceAll
            .Target = vsFindTarget.vsFindTargetCurrentDocument
            .MatchCase = True
            .MatchWholeWord = False
            .MatchInHiddenText = True
            .Backwards = False
        End With

        ' Keep replacing until a pass reports that no match is left.
        Dim outcome As vsFindResult
        Do
            outcome = DTE.Find.Execute()
        Loop Until outcome = vsFindResult.vsFindResultNotFound
    End Sub