如何在C#中根据文件头识别doc、docx、pdf、xls和xlsx文件? 我不想依赖文件扩展名,也不想依赖MimeMapping.GetMimeMapping,因为这两者都可以被人为篡改。
我知道如何读取文件头,但不知道哪些字节组合可以表明文件是doc、docx、pdf、xls还是xlsx。 有什么想法吗?
答案 0 :(得分:8)
此问题包含使用文件的第一个字节来确定文件类型的示例:Using .NET, how can you find the mime type of a file based on the file signature not the extension
这是一篇很长的帖子,所以我在下面发布相关答案:
/// <summary>
/// Guesses a MIME type from the leading signature ("magic number") bytes of a file.
/// The file extension is consulted only to tell apart formats that share one signature
/// (OLE compound files, ZIP-based Office files, RIFF, ASF and Ogg containers).
/// </summary>
public class MimeType
{
    // Leading-byte signatures, written as decimal byte values.
    private static readonly byte[] BMP = { 66, 77 };                                  // "BM"
    private static readonly byte[] DOC = { 208, 207, 17, 224, 161, 177, 26, 225 };    // D0 CF 11 E0 A1 B1 1A E1
    private static readonly byte[] EXE_DLL = { 77, 90 };                              // "MZ"
    private static readonly byte[] GIF = { 71, 73, 70, 56 };                          // "GIF8"
    private static readonly byte[] ICO = { 0, 0, 1, 0 };
    private static readonly byte[] JPG = { 255, 216, 255 };
    private static readonly byte[] MP3 = { 255, 251, 48 };
    private static readonly byte[] OGG = { 79, 103, 103, 83, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0 }; // "OggS" + fixed header bytes
    // "%PDF-" only: the previous signature "%PDF-1." rejected files whose header is "%PDF-2.x".
    private static readonly byte[] PDF = { 37, 80, 68, 70, 45 };
    private static readonly byte[] PNG = { 137, 80, 78, 71, 13, 10, 26, 10, 0, 0, 0, 13, 73, 72, 68, 82 };
    private static readonly byte[] RAR = { 82, 97, 114, 33, 26, 7, 0 };               // "Rar!" 1A 07 00
    private static readonly byte[] SWF = { 70, 87, 83 };                              // "FWS"
    private static readonly byte[] TIFF = { 73, 73, 42, 0 };                          // "II*" 00
    private static readonly byte[] TORRENT = { 100, 56, 58, 97, 110, 110, 111, 117, 110, 99, 101 }; // "d8:announce"
    private static readonly byte[] TTF = { 0, 1, 0, 0, 0 };
    private static readonly byte[] WAV_AVI = { 82, 73, 70, 70 };                      // "RIFF"
    private static readonly byte[] WMV_WMA = { 48, 38, 178, 117, 142, 102, 207, 17, 166, 217, 0, 170, 0, 98, 206, 108 };
    private static readonly byte[] ZIP_DOCX = { 80, 75, 3, 4 };                       // "PK" 03 04

    /// <summary>
    /// Returns the MIME type that matches the leading bytes of <paramref name="file"/>.
    /// </summary>
    /// <param name="file">File content; only the first bytes are inspected.</param>
    /// <param name="fileName">
    /// File name whose extension disambiguates container formats. When it is null, empty or
    /// whitespace the default MIME type is returned without inspecting <paramref name="file"/>.
    /// </param>
    /// <returns>
    /// The detected MIME type, or <c>application/octet-stream</c> when no signature matches.
    /// </returns>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="file"/> is null and <paramref name="fileName"/> is not blank.
    /// </exception>
    public static string GetMimeType(byte[] file, string fileName)
    {
        const string DefaultMime = "application/octet-stream"; // default for unknown content

        // A blank file name short-circuits to the default, as callers of the original relied on.
        if (string.IsNullOrWhiteSpace(fileName))
        {
            return DefaultMime;
        }

        if (file == null)
        {
            throw new ArgumentNullException(nameof(file));
        }

        // Invariant upper-casing: culture-sensitive ToUpper() maps 'i' to a dotted capital I
        // under Turkish culture, which would break the ".AVI" comparison below.
        string extension = (Path.GetExtension(fileName) ?? string.Empty).ToUpperInvariant();

        if (StartsWith(file, BMP))
        {
            return "image/bmp";
        }

        if (StartsWith(file, DOC))
        {
            // The OLE compound-file header is shared by the legacy Office formats.
            switch (extension)
            {
                case ".XLS":
                    return "application/vnd.ms-excel";
                case ".PPT":
                    return "application/vnd.ms-powerpoint";
                default:
                    return "application/msword";
            }
        }

        if (StartsWith(file, EXE_DLL))
        {
            return "application/x-msdownload"; // .exe and .dll use the same MIME type
        }

        if (StartsWith(file, GIF))
        {
            return "image/gif";
        }

        if (StartsWith(file, ICO))
        {
            return "image/x-icon";
        }

        if (StartsWith(file, JPG))
        {
            return "image/jpeg";
        }

        if (StartsWith(file, MP3))
        {
            return "audio/mpeg";
        }

        if (StartsWith(file, OGG))
        {
            switch (extension)
            {
                case ".OGX":
                    return "application/ogg";
                case ".OGA":
                    return "audio/ogg";
                default:
                    return "video/ogg";
            }
        }

        if (StartsWith(file, PDF))
        {
            return "application/pdf";
        }

        if (StartsWith(file, PNG))
        {
            return "image/png";
        }

        if (StartsWith(file, RAR))
        {
            return "application/x-rar-compressed";
        }

        if (StartsWith(file, SWF))
        {
            return "application/x-shockwave-flash";
        }

        if (StartsWith(file, TIFF))
        {
            return "image/tiff";
        }

        if (StartsWith(file, TORRENT))
        {
            return "application/x-bittorrent";
        }

        if (StartsWith(file, TTF))
        {
            return "application/x-font-ttf";
        }

        if (StartsWith(file, WAV_AVI))
        {
            return extension == ".AVI" ? "video/x-msvideo" : "audio/x-wav";
        }

        if (StartsWith(file, WMV_WMA))
        {
            return extension == ".WMA" ? "audio/x-ms-wma" : "video/x-ms-wmv";
        }

        if (StartsWith(file, ZIP_DOCX))
        {
            // Office Open XML documents are ZIP archives; only the extension separates them here.
            switch (extension)
            {
                case ".DOCX":
                    return "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
                case ".XLSX":
                    return "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
                case ".PPTX":
                    return "application/vnd.openxmlformats-officedocument.presentationml.presentation";
                default:
                    return "application/x-zip-compressed";
            }
        }

        return DefaultMime;
    }

    /// <summary>
    /// Returns true when <paramref name="data"/> is at least as long as
    /// <paramref name="signature"/> and begins with exactly those bytes.
    /// </summary>
    private static bool StartsWith(byte[] data, byte[] signature)
    {
        if (data.Length < signature.Length)
        {
            return false;
        }

        for (int i = 0; i < signature.Length; i++)
        {
            if (data[i] != signature[i])
            {
                return false;
            }
        }

        return true;
    }
}
答案 1 :(得分:6)
仅使用文件签名并不是那么可行(因为新的Office格式是ZIP文件,而旧的Office文件是OLE CF / OLE SS容器),但是您可以使用C#代码来读取它们并找出它们是什么。
对于最新的Office格式,您可以使用System.IO.Packaging
阅读(DOCX / PPTX / XLSX / ...)ZIP文件:https://msdn.microsoft.com/en-us/library/ms568187(v=vs.110).aspx
这样做,您可以找到第一个文档部分的ContentType
并推断使用它。
对于较旧的Office文件(Office 2003),您可以使用此库根据其内容区分它们(请注意,MSI和MSG文件也使用此文件格式): http://sourceforge.net/projects/openmcdf/
我希望这有帮助! :)
如果我早些时候找到这个答案的话,那肯定对我有帮助。 ;)
答案 2 :(得分:1)
来自user2173353的答案是最正确的答案,因为OP特别提到了Office文件格式。但是,我不想添加整个库(OpenMCDF)来识别传统的Office格式,所以我编写了自己的例程来做这件事。
/// <summary>
/// Identifies a Compound File Binary (CFB / OLE structured storage) file by reading the
/// 128-bit class identifier stored in the first entry of its first directory sector and
/// looking it up in <c>Formats</c>.
/// </summary>
/// <param name="fileData">
/// Seekable stream containing the whole file. It is read from offset 0 regardless of its
/// current position, is left open, and its position is not restored.
/// </param>
/// <returns>
/// The matching <see cref="CfbFileFormat"/>, or <see cref="CfbFileFormat.Unknown"/> when the
/// data is not a CFB file, is truncated or malformed, or carries an unrecognised identifier.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="fileData"/> is null.</exception>
/// <exception cref="ArgumentException"><paramref name="fileData"/> is not seekable.</exception>
public static CfbFileFormat GetCfbFileFormat(Stream fileData)
{
    if (fileData == null)
    {
        throw new ArgumentNullException(nameof(fileData));
    }

    if (!fileData.CanSeek)
    {
        throw new ArgumentException("Data stream must be seekable.", nameof(fileData));
    }

    const int ClsidOffsetInEntry = 80; // object type field offset inside a directory entry
    const int ClsidLength = 16;        // DWORD + WORD + WORD + BYTE[8]

    try
    {
        // Values in a CFB file are little-endian, which is what BinaryReader.ReadUInt16/ReadUInt32 read.
        // If using .net < 4.5 this BinaryReader constructor is not available. Use a simpler one but remember to also remove the 'using' statement.
        using (BinaryReader reader = new BinaryReader(fileData, Encoding.Unicode, true))
        {
            // Every later read uses an absolute offset, so the signature must be read from
            // the start of the stream rather than from wherever the caller left it.
            fileData.Position = 0;

            // Check that data has the CFB file header
            byte[] header = reader.ReadBytes(8);
            if (!header.SequenceEqual(new byte[] { 0xD0, 0xCF, 0x11, 0xE0, 0xA1, 0xB1, 0x1A, 0xE1 }))
            {
                return CfbFileFormat.Unknown;
            }

            // Sector size is stored at offset 30 (0x1E) as a 2-byte power of two.
            // The only valid values are 9 or 12, which gives 512 or 4096 byte sector size.
            fileData.Position = 30;
            ushort sectorShift = reader.ReadUInt16();
            if (sectorShift != 9 && sectorShift != 12)
            {
                // Anything else is a corrupt header; shifting by it would give a bogus address.
                return CfbFileFormat.Unknown;
            }

            int sectorSize = 1 << sectorShift;

            // Get first directory sector index at offset 48 in the header
            fileData.Position = 48;
            uint rootDirectoryIndex = reader.ReadUInt32();

            // File header is one sector wide. After that we can address the sector directly using the sector index
            long rootDirectoryAddress = sectorSize + ((long)rootDirectoryIndex * sectorSize);

            // Reject addresses outside the data before seeking: some streams throw
            // ArgumentOutOfRangeException (not IOException) for an out-of-range Position.
            long clsidAddress = rootDirectoryAddress + ClsidOffsetInEntry;
            if (clsidAddress + ClsidLength > fileData.Length)
            {
                return CfbFileFormat.Unknown;
            }

            // Object type field is a 128 bit GUID, encoded as "DWORD, WORD, WORD, BYTE[8]".
            fileData.Position = clsidAddress;
            int bits127_96 = reader.ReadInt32();
            short bits95_80 = reader.ReadInt16();
            short bits79_64 = reader.ReadInt16();
            byte[] bits63_0 = reader.ReadBytes(8);
            if (bits63_0.Length != 8)
            {
                // ReadBytes returns fewer bytes at end of stream; the Guid constructor would throw.
                return CfbFileFormat.Unknown;
            }

            Guid guid = new Guid(bits127_96, bits95_80, bits79_64, bits63_0);

            // Compare to known file format GUIDs
            CfbFileFormat result;
            return Formats.TryGetValue(guid, out result) ? result : CfbFileFormat.Unknown;
        }
    }
    catch (IOException)
    {
        // Includes EndOfStreamException raised when the data is truncated.
        return CfbFileFormat.Unknown;
    }
    catch (OverflowException)
    {
        return CfbFileFormat.Unknown;
    }
}
/// <summary>
/// File kinds that <c>GetCfbFileFormat</c> can report for a compound-file container.
/// Numeric values are spelled out and match the implicit declaration order.
/// </summary>
public enum CfbFileFormat
{
    /// <summary>Legacy .doc file.</summary>
    Doc = 0,

    /// <summary>Legacy .xls file.</summary>
    Xls = 1,

    /// <summary>MSI file.</summary>
    Msi = 2,

    /// <summary>Legacy .ppt file.</summary>
    Ppt = 3,

    /// <summary>Not a compound file, or a compound file of an unrecognised kind.</summary>
    Unknown = 4
}
// Lookup from the class identifier (GUID) read out of a compound file's directory sector
// to the file format it denotes. Two distinct identifiers map to Xls.
private static readonly Dictionary<Guid, CfbFileFormat> Formats = new Dictionary<Guid, CfbFileFormat>
{
    [new Guid("00020810-0000-0000-c000-000000000046")] = CfbFileFormat.Xls,
    [new Guid("00020820-0000-0000-c000-000000000046")] = CfbFileFormat.Xls,
    [new Guid("00020906-0000-0000-c000-000000000046")] = CfbFileFormat.Doc,
    [new Guid("000c1084-0000-0000-c000-000000000046")] = CfbFileFormat.Msi,
    [new Guid("64818d10-4f9b-11cf-86ea-00aa00b929e8")] = CfbFileFormat.Ppt
};
可以根据需要添加其他格式标识符。
我已经在.doc和.xls文件上测试过这段代码,运行良好。我没有测试过使用4096字节扇区大小的CFB文件,因为我甚至不知道在哪里可以找到这样的文件。
该代码基于以下文件中的信息:
答案 3 :(得分:0)
user2173353的答案似乎是检测新的Office .docx / .xlsx格式的正确解决方案。 为了补充一些细节,下面的检查似乎可以正确识别这些格式:
/// <summary>
/// MS .docx, .xlsx and other Office Open XML files are (correctly) identified as zip files
/// by signature lookup. This opens the stream with System.IO.Packaging and reports whether
/// the resulting package has any parts; if it does, the data is treated as a package
/// rather than a plain zip file.
/// </summary>
/// <param name="stream">Stream holding the candidate file; passed to <c>Package.Open</c> for reading.</param>
/// <returns><c>true</c> when the opened package contains at least one part; otherwise <c>false</c>.</returns>
/// <remarks>
/// Exceptions thrown by <c>Package.Open</c> are not caught here and propagate to the caller.
/// NOTE(review): disposing the package is expected to leave the caller-supplied stream open - confirm.
/// </remarks>
private static bool IsPackage(this Stream stream)
{
    // Package is IDisposable; release it as soon as the parts have been inspected.
    using (Package package = Package.Open(stream, FileMode.Open, FileAccess.Read))
    {
        return package.GetParts().Any();
    }
}