How to programmatically iterate through subscripts,superscripts and equations found in a Word document
I have a few Word documents, each containing a few hundred pages of scientific data which includes:
- Chemical formulae (H2SO4 with all proper subscripts & superscripts)
- Scientific numbers (exponents formatted using superscripts)
- Lots of Mathematical Equations. Written using mathematical equation editor in Word.
Problem is, storing this data in Word is not efficient for us. So we want to store all this information in a database (MySQL). We want to convert the formatting to LaTex.
Is there any way to iterate through all the subcripts, superscripts and equations within a Word document using VBA?
Yes there is. I would sugest using Powershell as it handles Word files quite well. I think i will be the easiest way.
More on Powershell vs Word automation in here: http://www.simple-talk.com/dotnet/.net-tools/com-automation-of-office-applications-via-powershell/
I have digged a little deeper and i found this powershell script:
param([string]$docpath,[string]$htmlpath = $docpath)
$srcfiles = Get-ChildItem $docPath -filter "*.doc"
$saveFormat = [Enum]::Parse([Microsoft.Office.Interop.Word.WdSaveFormat], "wdFormatFilteredHTML");
$word = new-object -comobject word.application
$word.Visible = $False
function saveas-filteredhtml
{
$opendoc = $word.documents.open($doc.FullName);
$opendoc.saveas([ref]"$htmlpath\$doc.fullname.html", [ref]$saveFormat);
$opendoc.close();
}
ForEach ($doc in $srcfiles)
{
Write-Host "Processing :" $doc.FullName
saveas-filteredhtml
$doc = $null
}
$word.quit();
Save it as .ps1 and start it with:
convertdoc-tohtml.ps1 -docpath "C:\Documents" -htmlpath "C:\Output"
It will save all the .doc file from specified directory, as the html files. So i have a doc file in which i have your H2SO4 with subscripts and after powershell convertion the output is following:
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=Generator content="Microsoft Word 14 (filtered)">
<style>
<!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
font-size:11.0pt;
font-family:"Calibri","sans-serif";}
.MsoChpDefault
{font-family:"Calibri","sans-serif";}
.MsoPapDefault
{margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
-->
</style>
</head>
<body lang=EN-US>
<div class=WordSection1>
<p class=MsoNormal><span lang=PL>H<sub>2</sub>SO<sub>4</sub></span></p>
</div>
</body>
</html>
As you can see subscripts have their own tags in HTML so only thing that is left is to parse the file in bash or c++ to cut from body to /body , change the to LATEX and remove the rest of HTML tags afterwards.
Code from http://blogs.technet.com/b/bshukla/archive/2011/09/27/3347395.aspx
So i've developed a parser in C++ to look for HTML subscript and replace it with LATEX subscript.
The code:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
using namespace std;
vector < vector <string> > parse( vector < vector <string> > vec, string filename )
{
/*
PARSES SPECIFIED FILE. EACH WORD SEPARATED AND
PLACED IN VECTOR FIELD.
REQUIRED INCLUDES:
#include <iostream>
#include <fstream>
#include <string>
#include <sstream>
#include <vector>
EXPECTS: TWO DIMENTIONAL VECTOR
STRING WITH FILENAME
RETURNS: TWO DIMENTIONAL VECTOR
vec[lines][words]
*/
string vword;
ifstream vfile;
string tmp;
// FILENAME CONVERSION FROM STING
// TO CHAR TABLE
char cfilename[filename.length()+1];
if( filename.length() < 126 )
{
for(int i = 0; i < filename.length(); i++)
cfilename[i] = filename[i];
cfilename[filename.length()] = '\0';
}
else return vec;
// OPENING FILE
//
vfile.open( cfilename );
if (vfile.is_open())
{
while ( vfile.good() )
{
getline( vfile, vword );
vector < string > vline;
vline.clear();
for (int i = 0; i < vword.length(); i++)
{
tmp = "";
// PARSING CONTENT. OMITTING SPACES AND TABS
//
while (vword[i] != ' ' && vword[i] != ((char)9) && i < vword.length() )
tmp += vword[i++];
if( tmp.length() > 0 ) vline.push_back(tmp);
}
if (!vline.empty())
vec.push_back(vline);
}
vfile.close();
}
else cout << "Unable to open file " << filename << ".\n";
return vec;
}
int main()
{
vector < vector < string > > vec;
vec = parse( vec, "parse.html" );
bool body = false;
for (int i = 0; i < vec.size(); i++)
{
for (int j = 0; j < vec[i].size(); j++)
{
if ( vec[i][j] == "<body") body=true;
if ( vec[i][j] == "</body>" ) body=false;
if ( body == true )
{
for ( int k=0; k < vec[i][j].size(); k++ )
{
if (k+4 < vec[i][j].size() )
{
if ( vec[i][j][k] == '<' &&
vec[i][j][k+1] == 's' &&
vec[i][j][k+2] == 'u' &&
vec[i][j][k+3] == 'b' &&
vec[i][j][k+4] == '>' )
{
string tmp = "";
while (vec[i][j][k+5] != '<')
{
tmp+=vec[i][j][k+5];
k++;
}
tmp = "_{" + tmp + "}";
k=k+5+5;
cout << tmp << endl;;
}
else cout << vec[i][j][k];
}
else cout << vec[i][j][k];
}
cout << endl;
}
}
}
return 0;
}
For the html file:
<html>
<head>
<meta http-equiv=Content-Type content="text/html; charset=windows-1252">
<meta name=Generator content="Microsoft Word 14 (filtered)">
<style>
<!--
/* Font Definitions */
@font-face
{font-family:Calibri;
panose-1:2 15 5 2 2 2 4 3 2 4;}
/* Style Definitions */
p.MsoNormal, li.MsoNormal, div.MsoNormal
{margin-top:0in;
margin-right:0in;
margin-bottom:10.0pt;
margin-left:0in;
line-height:115%;
font-size:11.0pt;
font-family:"Calibri","sans-serif";}
.MsoChpDefault
{font-family:"Calibri","sans-serif";}
.MsoPapDefault
{margin-bottom:10.0pt;
line-height:115%;}
@page WordSection1
{size:8.5in 11.0in;
margin:1.0in 1.0in 1.0in 1.0in;}
div.WordSection1
{page:WordSection1;}
-->
</style>
</head>
<body lang=EN-US>
<div class=WordSection1>
<p class=MsoNormal><span lang=PL>H<sub>2</sub>SO<sub>4</sub></span></p>
</div>
</body>
</html>
The output is:
<body
lang=EN-US>
<div
class=WordSection1>
<p
class=MsoNormal><span
lang=PL>H_{2}
SO_{4}
</span></p>
</div>
It's not ideal of course, but treat is as proof of concept.
You can extract the xml directly from any office document that is 2007+. This is done in the following fashion:
- rename the file from .docx to .zip
- extract the file using 7zip (or some other extraction program)
- For the actual content of the document look in extracted folder under the
word
subfolder and thedocument.xml
file. That should contain all content of the document.
I created a sample document, and in the body tags I found this (note I quickly put this together, so the formatting might be a little off):
<?xml version="1.0" encoding="UTF-8" standalone="true"?>
<w:body>
-<w:p w:rsidRDefault="000E0C3A" w:rsidR="008B5DAA">
-<w:r>
<w:t xml:space="preserve">This </w:t>
</w:r>
- <w:r w:rsidRPr="000E0C3A">
-<w:rPr>
<w:vertAlign w:val="superscript"/>
</w:rPr>
<w:t>is</w:t>
</w:r>
- <w:r>
<w:t xml:space="preserve"> a </w:t>
</w:r>
-<w:r w:rsidRPr="000E0C3A">
-<w:rPr>
<w:vertAlign w:val="subscript"/>
</w:rPr>
<w:t>test</w:t>
</w:r>
-<w:r>
<w:t>.</w:t>
</w:r>
</w:p>
</w:body>
It appears that the <w:t>
tag is for text the <w:rPr>
is the definition of the font and the <w:p>
is a new paragraph.
The word equivalent looks like this:
I have been looking at a different approach from that pursued by mnmnc.
My attempts to save a test Word document as HTML were not a success. I have found in the past that Office generated HTML is so full of chaff that picking out the bits you want is near to impossible. I have found that to be the case here. I have also had a problem with equations. Word saves equations as images. For each equation there will be two images one with an extension of WMZ and one with an extension of GIF. If you display the html file with Google Chrome, the equations look OK but not wonderful; the appearance matches the GIF file when displayed with an image display/edit tool that can handle transparent images. If you display the HTML file with Internet Explorer, the equations look perfect. The HTML references the WMZ files so I assume Internet Explorer contains an extension to display WMZ files which are apparently Windows Media Player skins although WMP claims they are corrupt.
Additional information
I should have included this information in the original answer.
I created a small Word document which I saved as Html. The three panels in the image below shows the original Word document, the Html document as displayed by Microsoft Internet Explorer and the Html document as displayed by Google Chrome.
As explained earlier the difference between the IE and Chrome images is the result of the equations being saved twice, once in WMZ format and once in GIF format. The Html is too large to show here.
The Html created by the macro is:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Frameset//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head><meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
</head><body>
<p>Some ordinary text.</p>
<p>H<sub>2</sub>SO<sub>4</sub>.</p>
<p>Abc & def > ghi < jkl</p>
<p>x<sup>3</sup>+ x<sup>2</sup>+3x+4=0.</p><p></p>
<p><i>Equation</i> </p>
<p>Mno</p>
<p><i>Equation</i></p>
</body></html>
Which displays as:
I have not attempted to convert the equations since the free MathType Software Development Kit apparently includes routines that convert to LaTex
The code is pretty basic so not many comments. Ask if anything is unclear. Note: this is an improved version of the original code.
Sub ConvertToHtml()
Dim FileNum As Long
Dim NumPendingCR As Long
Dim objChr As Object
Dim PathCrnt As String
Dim rng As Word.Range
Dim WithinPara As Boolean
Dim WithinSuper As Boolean
Dim WithinSub As Boolean
FileNum = FreeFile
PathCrnt = ActiveDocument.Path
Open PathCrnt & "\TestWord.html" For Output Access Write Lock Write As #FileNum
Print #FileNum, "<!DOCTYPE html PUBLIC ""-//W3C//DTD XHTML 1.0 Frameset//EN""" & _
" ""http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"">" & _
vbCr & vbLf & "<html xmlns=""http://www.w3.org/1999/xhtml"" " & _
"xml:lang=""en"" lang=""en"">" & vbCr & vbLf & _
"<head><meta http-equiv=""Content-Type"" content=""text/html; " _
& "charset=utf-8"" />" & vbCr & vbLf & "</head><body>"
For Each rng In ActiveDocument.StoryRanges
NumPendingCR = 0
WithinPara = False
WithinSub = False
WithinSuper = False
Do While Not (rng Is Nothing)
For Each objChr In rng.Characters
If objChr.Font.Superscript Then
If Not WithinSuper Then
' Start of superscript
Print #FileNum, "<sup>";
WithinSuper = True
End If
ElseIf WithinSuper Then
' End of superscript
Print #FileNum, "</sup>";
WithinSuper = False
End If
If objChr.Font.Subscript Then
If Not WithinSub Then
' Start of subscript
Print #FileNum, "<sub>";
WithinSub = True
End If
ElseIf WithinSub Then
' End of subscript
Print #FileNum, "</sub>";
WithinSub = False
End If
Select Case objChr
Case vbCr
NumPendingCR = NumPendingCR + 1
Case "&"
Print #FileNum, CheckPara(NumPendingCR, WithinPara) & "&";
Case "<"
Print #FileNum, CheckPara(NumPendingCR, WithinPara) & "<";
Case ">"
Print #FileNum, CheckPara(NumPendingCR, WithinPara) & ">";
Case Chr(1)
Print #FileNum, CheckPara(NumPendingCR, WithinPara) & "<i>Equation</i>";
Case Else
Print #FileNum, CheckPara(NumPendingCR, WithinPara) & objChr;
End Select
Next
Set rng = rng.NextStoryRange
Loop
Next
If WithinPara Then
Print #FileNum, "</p>";
withpara = False
End If
Print #FileNum, vbCr & vbLf & "</body></html>"
Close FileNum
End Sub
Function CheckPara(ByRef NumPendingCR As Long, _
ByRef WithinPara As Boolean) As String
' Have a character to output. Check paragraph status, return
' necessary commands and adjust NumPendingCR and WithinPara.
Dim RtnValue As String
RtnValue = ""
If NumPendingCR = 0 Then
If Not WithinPara Then
CheckPara = "<p>"
WithinPara = True
Else
CheckPara = ""
End If
Exit Function
End If
If WithinPara And (NumPendingCR > 0) Then
' Terminate paragraph
RtnValue = "</p>"
NumPendingCR = NumPendingCR - 1
WithinPara = False
End If
Do While NumPendingCR > 1
' Replace each pair of CRs with an empty paragraph
RtnValue = RtnValue & "<p></p>"
NumPendingCR = NumPendingCR - 2
Loop
RtnValue = RtnValue & vbCr & vbLf & "<p>"
WithinPara = True
NumPendingCR = 0
CheckPara = RtnValue
End Function