Ignite CorruptedTreeException 导致集群故障

时间:2018-12-30 15:56:47

标签: ignite

我在Kubernetes(使用NVMe驱动器的AWS i3实例)上运行了一组Ignite Pod。当从Spark流式传输数据时,它们崩溃的频率越来越高,并出现以下错误消息:

Person.findOne(...

当我重新启动Pod时,我的Ignite集群无法重新启动,并且收到以下错误消息:

[SEVERE][data-streamer-stripe-2-#15][GridDhtAtomicCache] <MyCache> Unexpected exception during cache update
org.h2.message.DbException: General error: "class org.apache.ignite.internal.processors.cache.persistence.tree.CorruptedTreeException: Runtime failure on row: Row@75ab6623[ key: CacheKey [idHash=242632156, hash=-841684964, parentId=-8607237606486310912, hour=9, id=-8607237528489033728, date=2018-09-09 00:00:00.0], val: CacheValue [idHash=843227122, hash=-801894604, theta=[2, 3, 3, 0, 0, 26, -52, -109, 11, 0, 0, 0, 0, 0, -128, 63, -37, 46, -109, -44, -43, -122, 18, 0, 122, -52, 59, -110, 53, 32, -44, 6, -98, 53, 74, 8, 21, -22, -36, 34, 84, 91, -95, -30, -55, 14, 107, 51, -60, -21, -31, 123, 124, 77, -17, 57, 80, 47, -20, 115, 107, 53, -121, 64, -24, -50, -24, 95, -123, -76, 1, 80, 14, 43, -118, -64, 94, 43, 109, 84, -106, -114, -94, 15, 98, -118, -12, 89, -76, 68, 119, -20, 10, 112, 7, 120, -120, -102, 122, 56, 126, 63, 94, 120]], ver: GridCacheVersion [topVer=157608673, order=1546229366985, nodeOrder=4] ][ 2018-09-09 00:00:00, -8607237528489033728, 9, -8607237606486310912, 02030300001acc930b0000000000803fdb2e93d4d58612007acc3b923520d4069e354a0815eadc22545ba1e2c90e6b33c4ebe17b7c4def39502fec736b358740e8cee85f85b401500e2b8ac05e2b6d54968ea20f628af459b44477ec0a700778889a7a387e3f5e78 ]" [50000-197]
    at org.h2.message.DbException.get(DbException.java:168)
    at org.h2.message.DbException.convert(DbException.java:307)
    at org.apache.ignite.internal.processors.query.h2.database.H2TreeIndex.putx(H2TreeIndex.java:302)
    at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.addToIndex(GridH2Table.java:546)
    at org.apache.ignite.internal.processors.query.h2.opt.GridH2Table.update(GridH2Table.java:479)
    at org.apache.ignite.internal.processors.query.h2.IgniteH2Indexing.store(IgniteH2Indexing.java:768)
    at org.apache.ignite.internal.processors.query.GridQueryProcessor.store(GridQueryProcessor.java:1905)
    at org.apache.ignite.internal.processors.cache.query.GridCacheQueryManager.store(GridCacheQueryManager.java:404)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.finishUpdate(IgniteCacheOffheapManagerImpl.java:2633)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.invoke0(IgniteCacheOffheapManagerImpl.java:1646)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.invoke(IgniteCacheOffheapManagerImpl.java:1621)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager$GridCacheDataStore.invoke(GridCacheOffheapManager.java:1935)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl.invoke(IgniteCacheOffheapManagerImpl.java:428)
    at org.apache.ignite.internal.processors.cache.GridCacheMapEntry.innerUpdate(GridCacheMapEntry.java:2295)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateSingle(GridDhtAtomicCache.java:2494)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.update(GridDhtAtomicCache.java:1951)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateAllAsyncInternal0(GridDhtAtomicCache.java:1780)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.updateAllAsyncInternal(GridDhtAtomicCache.java:1668)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicAbstractUpdateFuture.sendSingleRequest(GridNearAtomicAbstractUpdateFuture.java:299)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicSingleUpdateFuture.map(GridNearAtomicSingleUpdateFuture.java:483)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicSingleUpdateFuture.mapOnTopology(GridNearAtomicSingleUpdateFuture.java:443)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridNearAtomicAbstractUpdateFuture.map(GridNearAtomicAbstractUpdateFuture.java:248)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.update0(GridDhtAtomicCache.java:1153)
    at org.apache.ignite.internal.processors.cache.distributed.dht.atomic.GridDhtAtomicCache.put0(GridDhtAtomicCache.java:611)
    at org.apache.ignite.internal.processors.cache.GridCacheAdapter.put(GridCacheAdapter.java:2449)
    at org.apache.ignite.internal.processors.cache.GridCacheAdapter.put(GridCacheAdapter.java:2426)
    at org.apache.ignite.internal.processors.cache.IgniteCacheProxyImpl.put(IgniteCacheProxyImpl.java:1105)
    at org.apache.ignite.internal.processors.cache.GatewayProtectedCacheProxy.put(GatewayProtectedCacheProxy.java:820)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamerCacheUpdaters$Individual.receive(DataStreamerCacheUpdaters.java:121)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamerUpdateJob.call(DataStreamerUpdateJob.java:140)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor.localUpdate(DataStreamProcessor.java:400)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor.processRequest(DataStreamProcessor.java:305)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor.access$000(DataStreamProcessor.java:60)
    at org.apache.ignite.internal.processors.datastreamer.DataStreamProcessor$1.onMessage(DataStreamProcessor.java:90)
    at org.apache.ignite.internal.managers.communication.GridIoManager.invokeListener(GridIoManager.java:1569)
    at org.apache.ignite.internal.managers.communication.GridIoManager.processRegularMessage0(GridIoManager.java:1197)
    at org.apache.ignite.internal.managers.communication.GridIoManager.access$4200(GridIoManager.java:127)
    at org.apache.ignite.internal.managers.communication.GridIoManager$9.run(GridIoManager.java:1093)
    at org.apache.ignite.internal.util.StripedExecutor$Stripe.body(StripedExecutor.java:505)
    at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
    at java.lang.Thread.run(Thread.java:748)
Caused by: org.h2.jdbc.JdbcSQLException: General error: "class org.apache.ignite.internal.processors.cache.persistence.tree.CorruptedTreeException: Runtime failure on row: Row@75ab6623[ key: CacheKey [idHash=242632156, hash=-841684964, parentId=-8607237606486310912, hour=9, id=-8607237528489033728, date=2018-09-09 00:00:00.0], val: CacheValue [idHash=843227122, hash=-801894604, theta=[2, 3, 3, 0, 0, 26, -52, -109, 11, 0, 0, 0, 0, 0, -128, 63, -37, 46, -109, -44, -43, -122, 18, 0, 122, -52, 59, -110, 53, 32, -44, 6, -98, 53, 74, 8, 21, -22, -36, 34, 84, 91, -95, -30, -55, 14, 107, 51, -60, -21, -31, 123, 124, 77, -17, 57, 80, 47, -20, 115, 107, 53, -121, 64, -24, -50, -24, 95, -123, -76, 1, 80, 14, 43, -118, -64, 94, 43, 109, 84, -106, -114, -94, 15, 98, -118, -12, 89, -76, 68, 119, -20, 10, 112, 7, 120, -120, -102, 122, 56, 126, 63, 94, 120]], ver: GridCacheVersion [topVer=157608673, order=1546229366985, nodeOrder=4] ][ 2018-09-09 00:00:00, -8607237528489033728, 9, -8607237606486310912, 02030300001acc930b0000000000803fdb2e93d4d58612007acc3b923520d4069e354a0815eadc22545ba1e2c90e6b33c4ebe17b7c4def39502fec736b358740e8cee85f85b401500e2b8ac05e2b6d54968ea20f628af459b44477ec0a700778889a7a387e3f5e78 ]" [50000-197]
    at org.h2.message.DbException.getJdbcSQLException(DbException.java:357)
    ... 41 more
Caused by: class org.apache.ignite.internal.processors.cache.persistence.tree.CorruptedTreeException: Runtime failure on row: Row@75ab6623[ key: CacheKey [idHash=242632156, hash=-841684964, parentId=-8607237606486310912, hour=9, id=-8607237528489033728, date=2018-09-09 00:00:00.0], val: CacheValue [idHash=843227122, hash=-801894604, theta=[2, 3, 3, 0, 0, 26, -52, -109, 11, 0, 0, 0, 0, 0, -128, 63, -37, 46, -109, -44, -43, -122, 18, 0, 122, -52, 59, -110, 53, 32, -44, 6, -98, 53, 74, 8, 21, -22, -36, 34, 84, 91, -95, -30, -55, 14, 107, 51, -60, -21, -31, 123, 124, 77, -17, 57, 80, 47, -20, 115, 107, 53, -121, 64, -24, -50, -24, 95, -123, -76, 1, 80, 14, 43, -118, -64, 94, 43, 109, 84, -106, -114, -94, 15, 98, -118, -12, 89, -76, 68, 119, -20, 10, 112, 7, 120, -120, -102, 122, 56, 126, 63, 94, 120]], ver: GridCacheVersion [topVer=157608673, order=1546229366985, nodeOrder=4] ][ 2018-09-09 00:00:00, -8607237528489033728, 9, -8607237606486310912, 02030300001acc930b0000000000803fdb2e93d4d58612007acc3b923520d4069e354a0815eadc22545ba1e2c90e6b33c4ebe17b7c4def39502fec736b358740e8cee85f85b401500e2b8ac05e2b6d54968ea20f628af459b44477ec0a700778889a7a387e3f5e78 ]
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.doPut(BPlusTree.java:2285)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putx(BPlusTree.java:2232)
    at org.apache.ignite.internal.processors.query.h2.database.H2TreeIndex.putx(H2TreeIndex.java:299)
    ... 38 more
Caused by: java.lang.IllegalStateException: Failed to get page IO instance (page content is corrupted)
    at org.apache.ignite.internal.processors.cache.persistence.tree.io.IOVersions.forVersion(IOVersions.java:85)
    at org.apache.ignite.internal.processors.cache.persistence.tree.io.IOVersions.forPage(IOVersions.java:97)
    at org.apache.ignite.internal.processors.cache.persistence.CacheDataRowAdapter.initFromLink(CacheDataRowAdapter.java:154)
    at org.apache.ignite.internal.processors.cache.persistence.CacheDataRowAdapter.initFromLink(CacheDataRowAdapter.java:108)
    at org.apache.ignite.internal.processors.query.h2.database.H2RowFactory.getRow(H2RowFactory.java:62)
    at org.apache.ignite.internal.processors.query.h2.database.H2Tree.createRowFromLink(H2Tree.java:217)
    at org.apache.ignite.internal.processors.query.h2.database.io.AbstractH2ExtrasInnerIO.getLookupRow(AbstractH2ExtrasInnerIO.java:141)
    at org.apache.ignite.internal.processors.query.h2.database.io.AbstractH2ExtrasInnerIO.getLookupRow(AbstractH2ExtrasInnerIO.java:36)
    at org.apache.ignite.internal.processors.query.h2.database.H2Tree.getRow(H2Tree.java:248)
    at org.apache.ignite.internal.processors.query.h2.database.H2Tree.getRow(H2Tree.java:55)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.getRow(BPlusTree.java:5118)
    at org.apache.ignite.internal.processors.query.h2.database.H2Tree.compare(H2Tree.java:327)
    at org.apache.ignite.internal.processors.query.h2.database.H2Tree.compare(H2Tree.java:55)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.compare(BPlusTree.java:5105)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.findInsertionPoint(BPlusTree.java:5025)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.access$1300(BPlusTree.java:90)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree$Search.run0(BPlusTree.java:291)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree$GetPageHandler.run(BPlusTree.java:5615)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree$Search.run(BPlusTree.java:271)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree$GetPageHandler.run(BPlusTree.java:5600)
    at org.apache.ignite.internal.processors.cache.persistence.tree.util.PageHandler.readPage(PageHandler.java:159)
    at org.apache.ignite.internal.processors.cache.persistence.DataStructure.read(DataStructure.java:334)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.putDown(BPlusTree.java:2499)
    at org.apache.ignite.internal.processors.cache.persistence.tree.BPlusTree.doPut(BPlusTree.java:2252)
    ... 40 more

我有两个问题:

  1. 从该错误中恢复数据丢失最少的方法是什么?
  2. 我可以采取哪些措施来防止这些CorruptedTreeException将来再次发生?

编辑:

Ignite Spark:

[SEVERE][exchange-worker-#53][GridDhtPartitionsExchangeFuture] Failed to reinitialize local partitions (rebalancing will be stopped): GridDhtPartitionExchangeId [topVer=AffinityTopologyVersion [topVer=5, minorTopVer=1], discoEvt=DiscoveryCustomEvent [customMsg=ChangeGlobalStateMessage [id=39389cff761-e26a5564-3e0b-40db-8d23-9b55baed8cb4, reqId=f7368209-3e64-4889-8125-11533195d465, initiatingNodeId=b88e2e48-9691-4ed7-975d-b638832a73d8, activate=true, baselineTopology=BaselineTopology [id=0, branchingHash=-518711786, branchingType='New BaselineTopology', baselineNodes=[2f1bae9c-b5c7-4255-b8b4-aca5fb462543, 29408921-5163-47af-9e14-cd0c83858ca5, 32bd1ab0-8060-478b-a3b9-f1d1c93d6e54, 7b893817-fc74-451d-987e-74f9d65956f1]], forceChangeBaselineTopology=false, timestamp=1546184663789], affTopVer=AffinityTopologyVersion [topVer=5, minorTopVer=1], super=DiscoveryEvent [evtNode=TcpDiscoveryNode [id=b88e2e48-9691-4ed7-975d-b638832a73d8, addrs=[0:0:0:0:0:0:0:1%lo, 100.96.2.1, 127.0.0.1, 172.17.0.1, 172.25.40.134], sockAddrs=[/172.25.40.134:47500, ip-172-17-0-1.ec2.internal/172.17.0.1:47500, /100.96.2.1:47500, /0:0:0:0:0:0:0:1%lo:47500, /127.0.0.1:47500], discPort=47500, order=5, intOrder=5, lastExchangeTime=1546184663653, loc=false, ver=2.7.0#20181130-sha1:256ae401, isClient=false], topVer=5, nodeId8=4fa842da, msg=null, type=DISCOVERY_CUSTOM_EVT, tstamp=1546184663805]], nodeId=b88e2e48, evt=DISCOVERY_CUSTOM_EVT]
class org.apache.ignite.IgniteCheckedException: Unknown page IO type: 0
    at org.apache.ignite.internal.processors.cache.persistence.tree.io.PageIO.getBPlusIO(PageIO.java:699)
    at org.apache.ignite.internal.processors.cache.persistence.tree.io.PageIO.getPageIO(PageIO.java:575)
    at org.apache.ignite.internal.processors.cache.persistence.tree.io.PageIO.getPageIO(PageIO.java:535)
    at org.apache.ignite.internal.processors.cache.persistence.tree.util.PageHandler.writePage(PageHandler.java:277)
    at org.apache.ignite.internal.processors.cache.persistence.DataStructure.write(DataStructure.java:296)
    at org.apache.ignite.internal.processors.cache.persistence.freelist.AbstractFreeList.insertDataRow(AbstractFreeList.java:500)
    at org.apache.ignite.internal.processors.cache.persistence.freelist.CacheFreeListImpl.insertDataRow(CacheFreeListImpl.java:59)
    at org.apache.ignite.internal.processors.cache.persistence.freelist.CacheFreeListImpl.insertDataRow(CacheFreeListImpl.java:35)
    at org.apache.ignite.internal.processors.cache.persistence.RowStore.addRow(RowStore.java:103)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl$CacheDataStoreImpl.update(IgniteCacheOffheapManagerImpl.java:2508)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheOffheapManager$GridCacheDataStore.update(GridCacheOffheapManager.java:1759)
    at org.apache.ignite.internal.processors.cache.IgniteCacheOffheapManagerImpl.update(IgniteCacheOffheapManagerImpl.java:443)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager.applyUpdate(GridCacheDatabaseSharedManager.java:2653)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager.applyLastUpdates(GridCacheDatabaseSharedManager.java:2339)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager.restoreState(GridCacheDatabaseSharedManager.java:1628)
    at org.apache.ignite.internal.processors.cache.persistence.GridCacheDatabaseSharedManager.beforeExchange(GridCacheDatabaseSharedManager.java:1302)
    at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.distributedExchange(GridDhtPartitionsExchangeFuture.java:1453)
    at org.apache.ignite.internal.processors.cache.distributed.dht.preloader.GridDhtPartitionsExchangeFuture.init(GridDhtPartitionsExchangeFuture.java:806)
    at org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body0(GridCachePartitionExchangeManager.java:2667)
    at org.apache.ignite.internal.processors.cache.GridCachePartitionExchangeManager$ExchangeWorker.body(GridCachePartitionExchangeManager.java:2539)
    at org.apache.ignite.internal.util.worker.GridWorker.run(GridWorker.java:120)
    at java.lang.Thread.run(Thread.java:748)

客户端配置:

df.select(
    df.col("eventDate"),
    df.col("eventHour"),
    df.col("parentS2CellId"),
    df.col("s2CellId"),
    df.col("theta")
).write.format(
    FORMAT_IGNITE
).mode(
    "Append"
).option(
    OPTION_STREAMER_ALLOW_OVERWRITE, true
).option(
    OPTION_CONFIG_FILE, config.configFilePath
).option(
    OPTION_TABLE, "MyCache"
).save

服务器配置:

<bean id="ignite.cfg" class="org.apache.ignite.configuration.IgniteConfiguration">
    <property name="peerClassLoadingEnabled" value="true"/>
    <property name="failureHandler">
        <bean class="org.apache.ignite.failure.NoOpFailureHandler"/>
    </property>
    <property name="binaryConfiguration">
        <bean class="org.apache.ignite.configuration.BinaryConfiguration">
            <property name="compactFooter" value="false"/>
        </bean>
    </property>
    <property name="clientFailureDetectionTimeout" value="900000"/>
    <property name="failureDetectionTimeout" value="900000"/>
    <property name="systemWorkerBlockedTimeout" value="900000"/>
    <property name="clientMode" value="true"/>
    <property name="communicationSpi">
        <bean class="org.apache.ignite.spi.communication.tcp.TcpCommunicationSpi">
            <property name="localPort" value="${TCP_COMMUNICATION_SPI_PORT}"/>
        </bean>
    </property>
    <property name="discoverySpi">
        <bean class="org.apache.ignite.spi.discovery.tcp.TcpDiscoverySpi">
            <property name="localPort" value="${TCP_DISCOVERY_SPI_PORT}"/>
            <property name="ipFinder">
                <bean class="org.apache.ignite.spi.discovery.tcp.ipfinder.kubernetes.TcpDiscoveryKubernetesIpFinder">
                    <property name="namespace" value="${K8S_NAMESPACE}"/>
                    <property name="serviceName" value="${K8S_SERVICE_NAME}"/>
                    <property name="masterUrl" value="https://${K8S_MASTER_URL}:443"/>
                    <property name="accountToken" value="${K8S_ACCOUNT_TOKEN_PATH}"/>                        
                </bean>
            </property>
        </bean>
    </property>
</bean>

0 个答案:

没有答案
相关问题