DirectShow音频/视频PTS时钟计算

时间:2010-11-21 01:46:09

标签: video audio synchronization directshow

问候,

我编写了一个directshow源过滤器,它从WinTS / ARM视频处理器上编写的ATSC-153广播中获取AVC / AAC视频帧/ AAC访问单元。输出引脚(其中2个,一个用于视频,一个用于音频)连接到相应的解码器和渲染器。目前,我从适当的RTP头中获取PTS,并将它们传递给源过滤器并对directshow时钟执行计算。视频PTS的速率为90Khz,音频PTS速率不同,我目前的测试流音频滴答为55.2Khz。

以下是 convert_to_dshow_timestamp() 和 FillBuffer() 例程。当我在过滤器中检索视频/音频时打印出转换后的时间戳,二者相差在 100–200ms 之内。这本身不算坏,是可以接受的。但是,播放时视频会落后音频 2–3 秒。

/* Convert a tick count of an arbitrary-rate clock into a DirectShow
 * reference time (100-nanosecond units).
 *
 * ts   - timestamp in ticks of the source clock (e.g. the 90 kHz RTP PTS
 *        for video, or the audio RTP clock).
 * rate - source clock frequency in Hz; a rate of 0 returns 0 so the
 *        caller never divides by zero.
 *
 * Returns ts rescaled to 100 ns units, rounded to the nearest unit.
 */
static unsigned long long convert_to_dshow_timestamp(
                                            unsigned long long ts,
                                            unsigned long rate
                                        )
{
    long double hz;
    long double multi;
    long double tmp;

    if (rate == 0)
    {
        return 0;
    }

    hz = (long double) 1.0 / rate;
    multi = hz / 1e-7;          /* seconds-per-tick expressed in 100 ns units */

    tmp = ((long double) ts * multi) + 0.5;   /* +0.5 rounds to nearest */
    return (unsigned long long) tmp;
}

/ *源过滤器FillBuffer()例程* / HRESULT OutputPin :: FillBuffer(IMediaSample * pSamp) {     BYTE * pData;     DWORD dataSize;     pipeStream流;     BOOL retVal;     DWORD returnBytes;     HRESULT小时;     DWORD discont;     REFERENCE_TIME ts;     REFERENCE_TIME df;     unsigned long long difPts;     unsigned long long difTimeRef;

pSamp->GetPointer(&pData);
dataSize = pSamp->GetSize();

ZeroMemory(pData, dataSize);

stream.lBuf = pData;
stream.dataSize = dataSize;

/* Pin type 1 is H.264 AVC video frames */
if (m_iPinType == 1)
{
    retVal = DeviceIoControl(
                                ghMHTune,
                                IOCTL_MHTUNE_RVIDEO_STREAM,
                                NULL,
                                0,
                                &stream,
                                sizeof(pipeStream),
                                &returnBytes,
                                NULL
                            );
    if (retVal == TRUE)
    {
        /* Get the data */
        /* Check for the first of the stream, if so, set the start time */
        pSamp->SetActualDataLength(returnBytes);
        hr = S_OK;
        if (returnBytes > 0)
        {
            /* The discontinuety is set in upper layers, when an RTP
             * sequence number has been lost.
             */
            discont = stream.discont;

            /* Check for another break in stream time */
            if (
                m_PrevTimeRef &&
                ((m_PrevTimeRef > (stream.timeRef + 90000 * 10)) ||
                ((m_PrevTimeRef + 90000 * 10) < stream.timeRef))
               )
            {
                dbg_log(TEXT("MY:DISC HERE\n"));
                 if (m_StartStream > 0)
                {
                    discont = 1;
                }
            }

            /* If the stream has not started yet, or there is a
             * discontinuety then reset the stream time.
             */
            if ((m_StartStream == 0) || (discont != 0))
            {
                sys_time = timeGetTime() - m_ClockStartTime;
                m_OtherSide->sys_time = sys_time;

                /* For Video, the clockRate is 90Khz */
                m_RefGap = (sys_time * (stream.clockRate / 1000)) +
                                                    (stream.clockRate / 2);

                /* timeRef is the PTS for the frame from the RTP header */
                m_TimeGap = stream.timeRef;
                m_StartStream = 1;
                difTimeRef = 1;
                m_PrevPTS = 0;
                m_PrevSysTime = timeGetTime();
                dbg_log(
                        TEXT("MY:StartStream %lld: %lld: %lld\n"),
                        sys_time,
                        m_RefGap,
                        m_TimeGap
                       );
            }
            else
            {
                m_StartStream++;
            }

            difTimeRef = stream.timeRef - m_PrevTimeRef;
            m_PrevTimeRef = stream.timeRef;

            /* Difference in 90 Khz clocking */
            ts = stream.timeRef - m_TimeGap + m_RefGap;
            ts = convert_to_dshow_timestamp(ts, stream.clockRate);

            if (discont != 0)
            {
                dbg_log(TEXT("MY:VDISC TRUE\n"));
                pSamp->SetDiscontinuity(TRUE);
            }
            else
            {
                pSamp->SetDiscontinuity(FALSE);
                pSamp->SetSyncPoint(TRUE);
            }

            difPts = ts - m_PrevPTS;

            df = ts + 1;
            m_PrevPTS = ts;
            dbg_log(
                    TEXT("MY:T %lld: %lld = %lld: %d: %lld\n"),
                    ts,
                    m_OtherSide->m_PrevPTS,
                    stream.timeRef,
                    (timeGetTime() - m_PrevSysTime),
                    difPts
                   );

            pSamp->SetTime(&ts, &df);
            m_PrevSysTime = timeGetTime();
        }
        else
        {
            Sleep(10);
        }
    }
    else
    {
        dbg_log(TEXT("MY:  Fill FAIL\n"));
        hr = E_FAIL;
    }
}
else if (m_iPinType == 2)
{
    /* Pin Type 2 is audio AAC Access units, with ADTS headers */
    retVal = DeviceIoControl(
                                ghMHTune,
                                IOCTL_MHTUNE_RAUDIO_STREAM,
                                NULL,
                                0,
                                &stream,
                                sizeof(pipeStream),
                                &returnBytes,
                                NULL
                            );

    if (retVal == TRUE)
    {
        /* Get the data */
        /* Check for the first of the stream, if so, set the start time */
        hr = S_OK;
        if (returnBytes > 0)
        {
            discont = stream.discont;
            if ((m_StartStream == 0) || (discont != 0))
            {
                sys_time = timeGetTime() - m_ClockStartTime;
                m_RefGap = (sys_time * (stream.clockRate / 1000)) +
                                                    (stream.clockRate / 2);

                /* Mark the first PTS from stream.  This PTS is from the
                 * RTP header, and is usually clocked differently than the
                 * video clock.
                 */
                m_TimeGap = stream.timeRef;
                m_StartStream = 1;
                difTimeRef = 1;
                m_PrevPTS = 0;
                m_PrevSysTime = timeGetTime();
                dbg_log(
                        TEXT("MY:AStartStream %lld: %lld: %lld\n"),
                        sys_time,
                        m_RefGap,
                        m_TimeGap
                       );
            }

            /* Let the video side stream in first before letting audio
             * start to flow.
             */
            if (m_OtherSide->m_StartStream < 32)
            {
                pSamp->SetActualDataLength(0);
                Sleep(10);
                return hr;
            }
            else
            {
                pSamp->SetActualDataLength(returnBytes);
            }

            difTimeRef = stream.timeRef - m_PrevTimeRef;
            m_PrevTimeRef = stream.timeRef;

            if (discont != 0)
            {
                dbg_log(TEXT("MY:ADISC TRUE\n"));
                pSamp->SetDiscontinuity(TRUE);
            }
            else
            {
                pSamp->SetDiscontinuity(FALSE);
                pSamp->SetSyncPoint(TRUE);
            }

            /* Difference in Audio PTS clock, TESTING AT 55.2 Khz */
            ts = stream.timeRef - m_TimeGap + m_RefGap;
            ts = convert_to_dshow_timestamp(ts, stream.clockRate);

            difPts = ts - m_PrevPTS;

            df = ts + 1;
            m_PrevPTS = ts;
            dbg_log(
                    TEXT("MY:AT %lld = %lld: %d: %lld\n"),
                    ts,
                    stream.timeRef,
                    (timeGetTime() - m_PrevSysTime),
                    difPts
                   );

            pSamp->SetTime(&ts, &df);
            m_PrevSysTime = timeGetTime();
        }
        else
        {
            pSamp->SetActualDataLength(0);
            Sleep(10);
        }
    }
}
return hr;

} / *代码结束* /

我尝试简单地在视频 PTS 上加上 (90000 * 10),想看看视频是否会远远领先于音频,但事实并非如此。视频仍然落后音频 2 秒或更久。我实在不明白为什么这不起作用:每个视频帧的时间戳都应当提前了 10 秒,难道不对吗?

主要问题是:这个算法思路基本上合理吗?视频/音频各自单独播放时似乎都运行正常。

源过滤器不是推送过滤器,我不确定这是否会产生影响。我没有遇到解码器与广播输入不同步的问题。

非常感谢。

1 个答案:

答案 0 :(得分:3)

实际上我找出了问题,其中有两个问题。

第一个问题是对 H.264 SPS 帧的处理不当。解码器启动时会丢弃每一帧,直到找到 SPS 帧为止。该流以每秒 15 帧编码。这会打乱计时,因为解码器会在不到 10ms 内消耗掉多达一秒的视频;之后渲染的每一帧都被认为迟到了,渲染器会试图快进追赶,而由于这是实时源,帧又会再次耗尽。解决方法就放在前面的代码中:确保至少缓冲 32 帧(约 2 秒)之后再放行音频。

第二个问题确实围绕着问题的根源。我使用RTP标头中的PTS作为时间参考。虽然这可以在单独的音频和/或视频情况下工作,但是不能保证视频RTP PTS将匹配相应的音频RTP PTS,并且通常不会。因此,根据规范使用RTCP NTP时间,根据规范:

PTS = RTCP_SR_NTP_timestamp + (RTP_timestamp - RTCP_SR_RTP_timestamp) / media_clock_rate

这允许我将实际视频PTS与相应的音频PTS相匹配。