您认为在给定字节序列开始的System.Stream中找到位置的最佳方法是什么(第一次出现):
public static long FindPosition(Stream stream, byte[] byteSequence)
{
long position = -1;
/// ???
return position;
}
P.S。最简单但最快速的解决方案是优先考虑的。 :)
答案 0 :(得分:5)
我已经达成了这个解决方案。
我使用3.050 KB
和38803 lines
的ASCII文件做了一些基准测试。
在文件的最后一行搜索byte
array
22 bytes
后,我的结果大约为2.28
秒(在慢/旧机器中)。< / p>
public static long FindPosition(Stream stream, byte[] byteSequence)
{
if (byteSequence.Length > stream.Length)
return -1;
byte[] buffer = new byte[byteSequence.Length];
using (BufferedStream bufStream = new BufferedStream(stream, byteSequence.Length))
{
int i;
while ((i = bufStream.Read(buffer, 0, byteSequence.Length)) == byteSequence.Length)
{
if (byteSequence.SequenceEqual(buffer))
return bufStream.Position - byteSequence.Length;
else
bufStream.Position -= byteSequence.Length - PadLeftSequence(buffer, byteSequence);
}
}
return -1;
}
private static int PadLeftSequence(byte[] bytes, byte[] seqBytes)
{
int i = 1;
while (i < bytes.Length)
{
int n = bytes.Length - i;
byte[] aux1 = new byte[n];
byte[] aux2 = new byte[n];
Array.Copy(bytes, i, aux1, 0, n);
Array.Copy(seqBytes, aux2, n);
if (aux1.SequenceEqual(aux2))
return i;
i++;
}
return i;
}
答案 1 :(得分:4)
如果您将流视为另一个字节序列,则可以像搜索字符串一样搜索它。 Wikipedia有一篇很棒的文章。 Boyer-Moore是一个很好的简单算法。
这是我用Java编写的一个快速黑客。如果不是Boyer-Moore,它的效果非常接近。希望它有所帮助;)
public static final int BUFFER_SIZE = 32;
public static int [] buildShiftArray(byte [] byteSequence){
int [] shifts = new int[byteSequence.length];
int [] ret;
int shiftCount = 0;
byte end = byteSequence[byteSequence.length-1];
int index = byteSequence.length-1;
int shift = 1;
while(--index >= 0){
if(byteSequence[index] == end){
shifts[shiftCount++] = shift;
shift = 1;
} else {
shift++;
}
}
ret = new int[shiftCount];
for(int i = 0;i < shiftCount;i++){
ret[i] = shifts[i];
}
return ret;
}
public static byte [] flushBuffer(byte [] buffer, int keepSize){
byte [] newBuffer = new byte[buffer.length];
for(int i = 0;i < keepSize;i++){
newBuffer[i] = buffer[buffer.length - keepSize + i];
}
return newBuffer;
}
public static int findBytes(byte [] haystack, int haystackSize, byte [] needle, int [] shiftArray){
int index = needle.length;
int searchIndex, needleIndex, currentShiftIndex = 0, shift;
boolean shiftFlag = false;
index = needle.length;
while(true){
needleIndex = needle.length-1;
while(true){
if(index >= haystackSize)
return -1;
if(haystack[index] == needle[needleIndex])
break;
index++;
}
searchIndex = index;
needleIndex = needle.length-1;
while(needleIndex >= 0 && haystack[searchIndex] == needle[needleIndex]){
searchIndex--;
needleIndex--;
}
if(needleIndex < 0)
return index-needle.length+1;
if(shiftFlag){
shiftFlag = false;
index += shiftArray[0];
currentShiftIndex = 1;
} else if(currentShiftIndex >= shiftArray.length){
shiftFlag = true;
index++;
} else{
index += shiftArray[currentShiftIndex++];
}
}
}
public static int findBytes(InputStream stream, byte [] needle){
byte [] buffer = new byte[BUFFER_SIZE];
int [] shiftArray = buildShiftArray(needle);
int bufferSize, initBufferSize;
int offset = 0, init = needle.length;
int val;
try{
while(true){
bufferSize = stream.read(buffer, needle.length-init, buffer.length-needle.length+init);
if(bufferSize == -1)
return -1;
if((val = findBytes(buffer, bufferSize+needle.length-init, needle, shiftArray)) != -1)
return val+offset;
buffer = flushBuffer(buffer, needle.length);
offset += bufferSize-init;
init = 0;
}
} catch (IOException e){
e.printStackTrace();
}
return -1;
}
答案 2 :(得分:3)
你基本上需要保持一个与byteSequence
大小相同的缓冲区,这样一旦你发现流中的“下一个字节”匹配,你可以检查其余的但是仍然会回到如果它不是真正的匹配,则为“next but one”字节。
老实说,无论你做什么,都可能有点繁琐:(
答案 3 :(得分:2)
我需要自己做,已经开始,并且不喜欢上面的解决方案。我特别需要找到搜索字节序列结束的位置。在我的情况下,我需要快速转发流,直到该字节序列之后。但你也可以使用我的解决方案来解决这个问题:
var afterSequence = stream.ScanUntilFound(byteSequence);
var beforeSequence = afterSequence - byteSequence.Length;
这是StreamExtensions.cs
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace System
{
static class StreamExtensions
{
/// <summary>
/// Advances the supplied stream until the given searchBytes are found, without advancing too far (consuming any bytes from the stream after the searchBytes are found).
/// Regarding efficiency, if the stream is network or file, then MEMORY/CPU optimisations will be of little consequence here.
/// </summary>
/// <param name="stream">The stream to search in</param>
/// <param name="searchBytes">The byte sequence to search for</param>
/// <returns></returns>
public static int ScanUntilFound(this Stream stream, byte[] searchBytes)
{
// For this class code comments, a common example is assumed:
// searchBytes are {1,2,3,4} or 1234 for short
// # means value that is outside of search byte sequence
byte[] streamBuffer = new byte[searchBytes.Length];
int nextRead = searchBytes.Length;
int totalScannedBytes = 0;
while (true)
{
FillBuffer(stream, streamBuffer, nextRead);
totalScannedBytes += nextRead; //this is only used for final reporting of where it was found in the stream
if (ArraysMatch(searchBytes, streamBuffer, 0))
return totalScannedBytes; //found it
nextRead = FindPartialMatch(searchBytes, streamBuffer);
}
}
/// <summary>
/// Check all offsets, for partial match.
/// </summary>
/// <param name="searchBytes"></param>
/// <param name="streamBuffer"></param>
/// <returns>The amount of bytes which need to be read in, next round</returns>
static int FindPartialMatch(byte[] searchBytes, byte[] streamBuffer)
{
// 1234 = 0 - found it. this special case is already catered directly in ScanUntilFound
// #123 = 1 - partially matched, only missing 1 value
// ##12 = 2 - partially matched, only missing 2 values
// ###1 = 3 - partially matched, only missing 3 values
// #### = 4 - not matched at all
for (int i = 1; i < searchBytes.Length; i++)
{
if (ArraysMatch(searchBytes, streamBuffer, i))
{
// EG. Searching for 1234, have #123 in the streamBuffer, and [i] is 1
// Output: 123#, where # will be read using FillBuffer next.
Array.Copy(streamBuffer, i, streamBuffer, 0, searchBytes.Length - i);
return i; //if an offset of [i], makes a match then only [i] bytes need to be read from the stream to check if there's a match
}
}
return 4;
}
/// <summary>
/// Reads bytes from the stream, making sure the requested amount of bytes are read (streams don't always fulfill the full request first time)
/// </summary>
/// <param name="stream">The stream to read from</param>
/// <param name="streamBuffer">The buffer to read into</param>
/// <param name="bytesNeeded">How many bytes are needed. If less than the full size of the buffer, it fills the tail end of the streamBuffer</param>
static void FillBuffer(Stream stream, byte[] streamBuffer, int bytesNeeded)
{
// EG1. [123#] - bytesNeeded is 1, when the streamBuffer contains first three matching values, but now we need to read in the next value at the end
// EG2. [####] - bytesNeeded is 4
var bytesAlreadyRead = streamBuffer.Length - bytesNeeded; //invert
while (bytesAlreadyRead < streamBuffer.Length)
{
bytesAlreadyRead += stream.Read(streamBuffer, bytesAlreadyRead, streamBuffer.Length - bytesAlreadyRead);
}
}
/// <summary>
/// Checks if arrays match exactly, or with offset.
/// </summary>
/// <param name="searchBytes">Bytes to search for. Eg. [1234]</param>
/// <param name="streamBuffer">Buffer to match in. Eg. [#123] </param>
/// <param name="startAt">When this is zero, all bytes are checked. Eg. If this value 1, and it matches, this means the next byte in the stream to read may mean a match</param>
/// <returns></returns>
static bool ArraysMatch(byte[] searchBytes, byte[] streamBuffer, int startAt)
{
for (int i = 0; i < searchBytes.Length - startAt; i++)
{
if (searchBytes[i] != streamBuffer[i + startAt])
return false;
}
return true;
}
}
}
答案 4 :(得分:1)
有点老问题,但这是我的答案。我发现阅读块然后在那里搜索是非常低效的,而不是一次只读一个并从那里开始。
另外,IIRC,如果序列的一部分在一个块读取而一半在另一个块中,则接受的答案将失败 - 例如,给定12345,搜索23,它将读取12,不匹配,然后读取34,不匹配等等......但是没有尝试过,因为它需要净4.0。无论如何,这更简单,而且可能更快。
static long ReadOneSrch(Stream haystack, byte[] needle)
{
int b;
long i = 0;
while ((b = haystack.ReadByte()) != -1)
{
if (b == needle[i++])
{
if (i == needle.Length)
return haystack.Position - needle.Length;
}
else
i = b == needle[0] ? 1 : 0;
}
return -1;
}
答案 5 :(得分:0)
static long Search(Stream stream, byte[] pattern)
{
long start = -1;
stream.Seek(0, SeekOrigin.Begin);
while(stream.Position < stream.Length)
{
if (stream.ReadByte() != pattern[0])
continue;
start = stream.Position - 1;
for (int idx = 1; idx < pattern.Length; idx++)
{
if (stream.ReadByte() != pattern[idx])
{
start = -1;
break;
}
}
if (start > -1)
{
return start;
}
}
return start;
}