我在C#中解析.doc文件以提取文本。
问题是.doc文件包含表格,形状和图像,以及我的文本。我使用Microsoft Interop Word库来提取文本。当我提取文本时,我还获得了形状和图像上的标签,以及表格的列和行内的数据。
我不需要形状和图像上的标签,也不需要表格中的数据。如何从我的 .doc 文件中删除这些形状、标签、图像和表格?
这是代码。
// Lets the user pick a Word document, copies its whole content to the clipboard
// through Word automation, reads it back as plain text and sorts the lines into
// two buffers: "check" (long lines that do not look like answer options) and
// "check1" (lines that start like an answer option).
// NOTE(review): this listing is cut off after the second loop; the enclosing try
// block and the method body are not closed in the visible text.
public void ReadMsWord()
{
// path of the file chosen by the user; stays null if the dialog is cancelled
string filePath = null;
// open dialog box to select file
OpenFileDialog file = new OpenFileDialog();
// dialog box title
file.Title = "Word File";
// directory the dialog starts in
file.InitialDirectory = "c:\\";
// RestoreDirectory = true on the dialog
file.RestoreDirectory = true;
// only take the path when the dialog was confirmed with OK
if (file.ShowDialog() == DialogResult.OK)
{
// store selected file path
filePath = file.FileName.ToString();
}
// NOTE(review): filePath is still null here when the dialog was cancelled, and it
// is passed to Documents.Open below without a check.
try
{
// create word application
Microsoft.Office.Interop.Word.Application word = new Microsoft.Office.Interop.Word.ApplicationClass();
// placeholder passed for every optional COM argument that is not supplied
object miss = System.Reflection.Missing.Value;
// boxed copy of the path, because Documents.Open takes its arguments by ref object
object path = filePath;
// third argument of Documents.Open: false, so the document is not opened read-only
object readOnly = false;
// open document
Microsoft.Office.Interop.Word.Document docs = word.Documents.Open(ref path, ref miss, ref readOnly, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss);
// select the whole story in the active window
docs.ActiveWindow.Selection.WholeStory();
// hand the selection over to the clipboard
docs.ActiveWindow.Selection.Copy();
// fetch the clipboard content as an IDataObject
IDataObject data = Clipboard.GetDataObject();
// read the clipboard content in plain-text format and split it into lines
// NOTE(review): presumably GetData can return null when no text is on the
// clipboard, which would make ToString() throw - confirm.
string t = "";
string[] y = {};
t = data.GetData(DataFormats.Text).ToString();
y = t.Split('\n');
string check = "";
string check1 = "";
string A = "";
//int i = 0;
// First pass: collect lines longer than 15 characters that do NOT start with
// 'A', 'B', 'C', 'D', '|' or a space. Inside [...] the '|' is a literal
// character, not alternation. The bound y.Length - 1 skips the last element.
for (int i = 0; i < y.Length - 1; i++)
{
if (!Regex.IsMatch(y[i], @"^([A|B|C|D]| )")&&(y[i].Length>15))
{
//@"^\d+"
//int j = i + 1;
// while (!Regex.IsMatch(asdf[j], @"^[A|B|C]"))
// {
// each kept line is appended after two newline characters
check = check + '\n'+'\n' + y[i];
//i++;
}
}
// Second pass: collect lines whose first character is 'A', 'B', 'C', 'D', '|'
// or a space (the character class lists those characters, it does not require
// a letter followed by a space). The last element is skipped here as well.
for (int i = 0; i < y.Length - 1; i++)
{
if (Regex.IsMatch(y[i], @"^[A |B |C |D ]"))
{
//@"^\d+"
//int j = i + 1;
// while (!Regex.IsMatch(asdf[j], @"^[A|B|C]"))
// {
check1 = check1 + '\n' + '\n' + y[i];
//i++;
}
}
@theGhostofc 这是我正在使用的代码,它会抛出“类型不匹配”(Type mismatch)异常:
/// <summary>
/// Lets the user pick a Word document, removes every table, floating shape and
/// inline picture from the opened copy, then closes the document and quits Word.
/// </summary>
/// <remarks>
/// The document is closed with wdDoNotSaveChanges, so the file on disk is left
/// untouched. To persist the clean-up, call docs.Save() (it takes no arguments)
/// before closing, or close with WdSaveOptions.wdSaveChanges instead.
/// </remarks>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    string filePath = null;
    // open dialog box to select file
    OpenFileDialog file = new OpenFileDialog();
    file.Title = "Word File";
    file.InitialDirectory = "c:\\";
    file.RestoreDirectory = true;
    if (file.ShowDialog() == DialogResult.OK)
    {
        filePath = file.FileName;
    }

    // Nothing to do when the dialog was cancelled; Documents.Open must not get a null path.
    if (string.IsNullOrEmpty(filePath))
    {
        return;
    }

    // Word's COM methods take their arguments as "ref object".
    object miss = System.Reflection.Missing.Value;
    object path = filePath;
    object readOnly = false;
    // Arguments for Close/Quit. Close expects a WdSaveOptions value first; passing
    // the file path there is what raised the "type mismatch" error.
    object saveOption = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;
    object originalFormat = Microsoft.Office.Interop.Word.WdOriginalFormat.wdOriginalDocumentFormat;
    object routeDocument = false;

    Microsoft.Office.Interop.Word.Application word = null;
    try
    {
        // create word application
        word = new Microsoft.Office.Interop.Word.Application();
        // open document
        Microsoft.Office.Interop.Word.Document docs = word.Documents.Open(ref path,
            ref miss, ref readOnly, ref miss,
            ref miss, ref miss, ref miss, ref miss,
            ref miss, ref miss, ref miss, ref miss,
            ref miss, ref miss, ref miss, ref miss);

        // Word collections are 1-based and shrink as items are deleted, so walk
        // them from the end instead of deleting inside a foreach over the collection.
        for (int i = docs.Tables.Count; i >= 1; i--)
        {
            docs.Tables[i].Delete();
        }

        for (int i = docs.Shapes.Count; i >= 1; i--)
        {
            // Shapes is indexed by "ref object", hence the explicit accessor call.
            object shapeIndex = i;
            docs.Shapes.get_Item(ref shapeIndex).Delete();
        }

        for (int i = docs.InlineShapes.Count; i >= 1; i--)
        {
            Microsoft.Office.Interop.Word.InlineShape ilshp = docs.InlineShapes[i];
            if (ilshp.Type == Microsoft.Office.Interop.Word.WdInlineShapeType.wdInlineShapePicture)
            {
                ilshp.Delete();
            }
        }

        // The cast picks the Close method over the Close event of the same name.
        ((Microsoft.Office.Interop.Word._Document)docs).Close(ref saveOption, ref originalFormat, ref routeDocument);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
    finally
    {
        // Always shut the automation instance down, otherwise a WINWORD.EXE
        // process is left running after every click.
        if (word != null)
        {
            ((Microsoft.Office.Interop.Word._Application)word).Quit(ref saveOption, ref originalFormat, ref routeDocument);
        }
    }
}
@theghostofc 代码如下:
/// <summary>
/// Asks the user for a Word document, strips all tables, floating shapes and
/// inline pictures from the opened copy, then closes it and quits Word.
/// </summary>
/// <remarks>
/// The document is closed with wdDoNotSaveChanges, so nothing is written back to
/// disk. Document.Save() takes no arguments; call docs.Save() before Close (or
/// close with WdSaveOptions.wdSaveChanges) if the stripped document must be kept.
/// </remarks>
/// <param name="sender">Control that raised the click event (unused).</param>
/// <param name="e">Event data (unused).</param>
private void button1_Click(object sender, EventArgs e)
{
    // open dialog box to select file
    OpenFileDialog file = new OpenFileDialog();
    file.Title = "Word File";
    file.InitialDirectory = "c:\\";
    file.RestoreDirectory = true;

    // Stop here when the dialog is cancelled instead of opening a null path.
    if (file.ShowDialog() != DialogResult.OK || string.IsNullOrEmpty(file.FileName))
    {
        return;
    }

    // Word's COM methods take their arguments as "ref object".
    object miss = System.Reflection.Missing.Value;
    object path = file.FileName;
    object readOnly = false;
    object saveOption = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;
    object originalFormat = Microsoft.Office.Interop.Word.WdOriginalFormat.wdOriginalDocumentFormat;
    object routeDocument = false;

    Microsoft.Office.Interop.Word.Application word = null;
    try
    {
        // Create Word through the Application interface; ApplicationClass cannot
        // be used when the interop types are embedded into the assembly.
        word = new Microsoft.Office.Interop.Word.Application();

        // Opening happens inside the try so a bad or locked file is reported
        // through the message box like every other failure.
        Microsoft.Office.Interop.Word.Document docs = word.Documents.Open(ref path, ref miss, ref readOnly, ref miss,
            ref miss, ref miss, ref miss, ref miss,
            ref miss, ref miss, ref miss, ref miss,
            ref miss, ref miss, ref miss, ref miss);

        // Delete from the last item to the first: the collections are 1-based and
        // are renumbered after each Delete, so a foreach over them is not reliable.
        for (int t = docs.Tables.Count; t >= 1; t--)
        {
            docs.Tables[t].Delete();
        }

        for (int s = docs.Shapes.Count; s >= 1; s--)
        {
            // Shapes is indexed by "ref object", hence the explicit accessor call.
            object shapeIndex = s;
            docs.Shapes.get_Item(ref shapeIndex).Delete();
        }

        for (int p = docs.InlineShapes.Count; p >= 1; p--)
        {
            Microsoft.Office.Interop.Word.InlineShape ilshp = docs.InlineShapes[p];
            if (ilshp.Type == Microsoft.Office.Interop.Word.WdInlineShapeType.wdInlineShapePicture)
            {
                ilshp.Delete();
            }
        }

        // The cast picks the Close method over the Close event of the same name.
        ((Microsoft.Office.Interop.Word._Document)docs).Close(ref saveOption, ref originalFormat, ref routeDocument);
    }
    catch (Exception ex)
    {
        MessageBox.Show(ex.Message);
    }
    finally
    {
        // Quitting Word releases the automation process deterministically;
        // forcing a garbage collection does not end WINWORD.EXE.
        if (word != null)
        {
            ((Microsoft.Office.Interop.Word._Application)word).Quit(ref saveOption, ref originalFormat, ref routeDocument);
        }
    }
}
这是代码。docs.Save() 和 docs.Close() 这两个方法都要求传入参数。
我已经给 docs.Close() 传了参数,但不知道应该给 docs.Save() 传哪些参数,所以我的代码中没有调用 docs.Save()。
请明确说明这些参数应该怎么传,非常感谢。
答案 0 :(得分:8)
如果你只是想从 Word 文档中删除表格、形状和图像,可以尝试以下代码片段:
// Removes all tables, floating shapes and inline pictures from the Word document
// at filePath (a string supplied by the surrounding code).
Microsoft.Office.Interop.Word.Application word = null;
// create object of missing value
object miss = System.Reflection.Missing.Value;
try
{
    // create word application
    word = new Microsoft.Office.Interop.Word.Application();
    // create object of selected file path
    object path = filePath;
    // open the document writable (ReadOnly = false)
    object readOnly = false;
    // open document
    Microsoft.Office.Interop.Word.Document docs = word.Documents.Open(ref path, ref miss, ref readOnly, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss, ref miss);

    // Word collections are 1-based and are renumbered after every Delete, so
    // remove items from the end rather than inside a foreach over the collection.
    for (int i = docs.Tables.Count; i >= 1; i--)
    {
        docs.Tables[i].Delete();
    }

    for (int i = docs.Shapes.Count; i >= 1; i--)
    {
        // Shapes is indexed by "ref object", hence the explicit accessor call.
        object shapeIndex = i;
        docs.Shapes.get_Item(ref shapeIndex).Delete();
    }

    for (int i = docs.InlineShapes.Count; i >= 1; i--)
    {
        Microsoft.Office.Interop.Word.InlineShape ilshp = docs.InlineShapes[i];
        if (ilshp.Type == Microsoft.Office.Interop.Word.WdInlineShapeType.wdInlineShapePicture)
        {
            ilshp.Delete();
        }
    }

    // To keep the result, call docs.Save() (no arguments) here, before closing.
    // NOTE(review): Close() without arguments leaves the save decision to Word's
    // default for a modified document - pass a WdSaveOptions value to make it explicit.
    docs.Close();
}
finally
{
    // Always end the automation instance so no WINWORD.EXE process is left behind.
    if (word != null)
    {
        object discard = Microsoft.Office.Interop.Word.WdSaveOptions.wdDoNotSaveChanges;
        ((Microsoft.Office.Interop.Word._Application)word).Quit(ref discard, ref miss, ref miss);
    }
}
完成后,您可以保存docs
。
如果您想删除更多类型的对象,可以查看 Microsoft.Office.Interop.Word.WdInlineShapeType 的其他取值,这样也可以删除链接的图片(linked pictures)。
P.S.:这段代码不能直接复制粘贴使用,请仅把它当作起点。