我无法通过 set 声明的变量将数据插入按天分区的表

时间:2019-01-23 11:43:41

标签: hive hiveql

我有一个按天分区的表。我尝试通过设置

插入数据

set hivevar:ds=2018-12-01;

然后使用 **INSERT OVERWRITE table XTABLE partition(day='${hivevar:ds}')**,效果很好。

但是当我像下面这样设置变量时:

set hivevar:pd=date_add('${hivevar:ds}',-1);

然后执行 **INSERT OVERWRITE table XTABLE partition(day='${hivevar:pd}')**,它就会抛出错误。我认为问题是由多余的引号引起的,但找不到解决方法。

错误是:

cannot recognize input near ''date_add('' '2018' '-' in constant(无法识别常量中 ''date_add('' '2018' '-' 附近的输入)

MYCODE:


-- ds holds a plain literal date.
set hivevar:ds=2018-12-01;
-- pd holds the unevaluated text date_add('2018-12-01',-1): the function call is
-- substituted as text, not executed (see the error reported further down).
set hivevar:pd=date_add('${hivevar:ds}',-1);
set hive.exec.dynamic.partition.mode=nonstrict;

CREATE TABLE IF NOT EXISTS XTABLE (emp_id BIGINT, start_time STRING, end_time STRING)
PARTITIONED BY(day STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t';


--THIS IS WORKING FINE
-- Static partition insert: after substitution the spec reads day='2018-12-01'.
INSERT OVERWRITE table XTABLE partition(day='${hivevar:ds}')
select distinct d.emp_id, d.start_time, d.end_time from
(
select emp_id, start_time, end_time from XTABLE where day='${hivevar:ds}'
) d;


--THIS IS THROWING AN ERROR cannot recognize input near ''date_add('' '2018' '-' in constant
--SEEMS PROBLEM IS WHILE SETTING THE VARIABLE
-- After substitution the spec reads day='date_add('2018-12-01',-1)', so the
-- inner quotes end the string constant early.
INSERT OVERWRITE table XTABLE partition(day='${hivevar:pd}')
select distinct d.emp_id, d.start_time, d.end_time from
(
select emp_id, start_time, end_time from XTABLE where day='${hivevar:pd}'
) d;

如果成功,则应显示如下消息:

将数据加载到表xtable分区(day = 2018-12-01)

2 个答案:

答案 0 :(得分:1)

@saicharan 设置变量时不能使用函数。我曾经遇到过类似的问题。

set hivevar:ds ='应始终具有静态值'

要解决此问题,您需要创建一个简单的脚本,如下所示:

# Compute the partition date in the shell, because a Hive variable only stores
# literal text. Adjust the offset as needed (e.g. "-1 day" for the previous day).
ds=$(date -d "+1 day" +"%Y-%m-%d")
echo "$ds"

# The backslash stops the shell from expanding ${hivevar:ds} inside the
# double-quoted string, so Hive receives the placeholder and substitutes it.
hive --hivevar ds="${ds}" -e "INSERT OVERWRITE table XTABLE partition(day='\${hivevar:ds}') "

这应该可以解决您的问题。让我知道它是否有效。

答案 1 :(得分:1)

目前,您正在尝试进行静态分区插入,并在分区规范中使用了函数。您可以改用动态分区插入,由数据集本身提供分区列:

import cv2
import os
import time
from scipy import spatial
import xml.etree.ElementTree as ET

def parse_annotation(ann_dir, labels=None):
    """Parse one XML annotation file into an image record and label counts.

    Args:
        ann_dir: Path (or file object) of the XML annotation; handed to
            ``ET.parse`` unchanged.
        labels: Optional collection of object names to keep. When it is
            ``None`` or empty, every object is kept.

    Returns:
        A tuple ``(all_imgs, seen_labels)``. ``all_imgs`` is a list holding
        the single image dict (keys ``'object'`` and, when present in the
        file, ``'width'`` / ``'height'``) if at least one object was kept,
        otherwise an empty list. ``seen_labels`` maps every object name
        found in the file to its number of occurrences, including names
        that were filtered out by ``labels``.
    """
    # Avoid a shared mutable default; an empty tuple means "no filtering".
    if labels is None:
        labels = ()

    all_imgs = []
    seen_labels = {}
    img = {'object': []}
    tree = ET.parse(ann_dir)

    for elem in tree.iter():
        if 'width' in elem.tag:
            img['width'] = int(elem.text)
        if 'height' in elem.tag:
            img['height'] = int(elem.text)
        if 'object' in elem.tag or 'part' in elem.tag:
            obj = {}

            for attr in list(elem):
                if 'name' in attr.tag:
                    name = attr.text
                    obj['name'] = name
                    seen_labels[name] = seen_labels.get(name, 0) + 1

                    # A filtered-out object is abandoned here, so its
                    # bounding box is never read.
                    if len(labels) > 0 and name not in labels:
                        break
                    # obj is stored by reference; coordinates parsed below
                    # still land in the stored dict.
                    img['object'].append(obj)

                if 'bndbox' in attr.tag:
                    for dim in list(attr):
                        for key in ('xmin', 'ymin', 'xmax', 'ymax'):
                            if key in dim.tag:
                                obj[key] = int(round(float(dim.text)))

    if len(img['object']) > 0:
        all_imgs.append(img)

    return all_imgs, seen_labels

labels = ['RBC', 'WBC', 'Platelet']

# Parse every annotation file in the testing set and print the label counts
# found in each one.
for ann_file in os.listdir('C:/Users/Neerajan/Desktop/Blood-Cell-Count/Testing/Annotations'):
    ann_dir = 'C:/Users/Neerajan/Desktop/Blood-Cell-Count/Testing/Annotations/' + ann_file
    # Keep the per-file counts in their own name: rebinding `labels` to the
    # returned dict would replace the filter list used for the next file.
    ground_truths, seen_labels = parse_annotation(ann_dir, labels)
    print(ann_file, seen_labels)

def iou(boxA, boxB):
    """Return the intersection-over-union of two axis-aligned boxes.

    Args:
        boxA: Sequence ``(x_min, y_min, x_max, y_max)``.
        boxB: Sequence ``(x_min, y_min, x_max, y_max)``.

    Returns:
        The overlap ratio as a float; ``0.0`` when the boxes do not overlap.
        Widths and heights are computed as ``max - min + 1``.
    """
    left = max(boxA[0], boxB[0])
    top = max(boxA[1], boxB[1])
    right = min(boxA[2], boxB[2])
    bottom = min(boxA[3], boxB[3])

    # Clamp each extent separately: if the boxes are disjoint on both axes,
    # two negative extents would otherwise multiply to a positive "area".
    inter_w = max(0, right - left + 1)
    inter_h = max(0, bottom - top + 1)
    inter_area = inter_w * inter_h

    area_a = (boxA[2] - boxA[0] + 1) * (boxA[3] - boxA[1] + 1)
    area_b = (boxB[2] - boxB[0] + 1) * (boxB[3] - boxB[1] + 1)

    return inter_area / float(area_a + area_b - inter_area)

from darkflow.net.build import TFNet
import numpy as np

# Testing dataset: options passed to TFNet.
# NOTE(review): the original path literal was split across two lines after
# "Blood-Cells- " (a syntax error); it is rejoined here without the stray
# whitespace -- confirm the directory name on disk.
options = {
    'model': 'C:/Users/Neerajan/Desktop/Blood-Cells-Project/cfg/tiny-yolo-voc-3c.cfg',
    'load': 3750,
    'threshold': 0.1,
    # 'gpu': 0.7
}

tfnet = TFNet(options)

avg_time = 0
pred_bb = []
pred_cls = []
pred_conf = []
num_images = 0  # images actually processed; denominator for the average time

# Run the detector on every test image, draw the kept detections, show and
# save the annotated image, and collect boxes / class ids / confidences.
for file_name in os.listdir('C:/Users/Neerajan/Desktop/Blood-Cell-Count/Testing/Images/'):
    tic = time.time()
    image = cv2.imread('C:/Users/Neerajan/Desktop/Blood-Cell-Count/Testing/Images/' + file_name)
    if image is None:
        # cv2.imread returns None when the file cannot be read as an image;
        # skip it instead of handing None to the network.
        continue
    num_images += 1
    output = tfnet.return_predict(image)

    # Per-image cell counts.
    rbc = 0
    wbc = 0
    platelets = 0

    # Per-image detections that survive filtering.
    cell = []
    cls = []
    conf = []

    # Bookkeeping for platelet de-duplication.
    record = []
    tl_ = []
    br_ = []
    iou_ = []
    iou_value = 0

    for prediction in output:
        label = prediction['label']
        confidence = prediction['confidence']
        tl = (prediction['topleft']['x'], prediction['topleft']['y'])
        br = (prediction['bottomright']['x'], prediction['bottomright']['y'])
        # Per-class confidence cut-offs.
        # NOTE(review): the annotation label list earlier in this file uses
        # 'Platelet' (singular); confirm which spelling the model emits.
        if label == 'RBC' and confidence < .5:
            continue
        if label == 'WBC' and confidence < .25:
            continue
        if label == 'Platelets' and confidence < .25:
            continue

        # clearing up spurious platelets
        if label == 'Platelets':
            if record:
                # Compare against the already-kept platelet whose top-left
                # corner is nearest to this one.
                tree = spatial.cKDTree(record)
                index = tree.query(tl)[1]
                iou_value = iou(tl + br, tl_[index] + br_[index])
                iou_.append(iou_value)
            if iou_value > 0.1:
                continue
            record.append(tl)
            tl_.append(tl)
            br_.append(br)

        # image = cv2.rectangle(image, tl, br,color, 2)
        center_x = int((tl[0] + br[0]) / 2)
        center_y = int((tl[1] + br[1]) / 2)
        center = (center_x, center_y)
        # Random colour for any label other than the three handled below.
        color = tuple(255 * np.random.rand(3))
        if label == 'RBC':
            color = (255, 0, 0)
            rbc = rbc + 1
        if label == 'WBC':
            color = (0, 255, 0)
            wbc = wbc + 1
        if label == 'Platelets':
            color = (0, 0, 255)
            platelets = platelets + 1
        radius = int((br[0] - tl[0]) / 2)
        image = cv2.circle(image, center, radius, color, 2)
        font = cv2.FONT_HERSHEY_COMPLEX
        image = cv2.putText(image, label, (center_x - 15, center_y + 5), font, .5, color, 1)
        cell.append([tl[0], tl[1], br[0], br[1]])
        if label == 'RBC': cls.append(0)
        if label == 'WBC': cls.append(1)
        if label == 'Platelets': cls.append(2)
        conf.append(confidence)
    pred_bb.append(cell)
    pred_cls.append(cls)
    pred_conf.append(conf)
    #cv2.putText(image, 'Total RBC: ' + str(rbc) + ', WBC: ' + str(wbc) + ', Platelets: ' + str(platelets), (0,image.shape[0] -10), cv2.FONT_HERSHEY_TRIPLEX, 0.5,  (0,0,0), 1)
    cv2.imshow('RBC: ' + str(rbc) + ', WBC: ' + str(wbc) + ', Platelets: ' + str(platelets), image)
    cv2.imwrite('C:/Users/Neerajan/Desktop/ReportDump/' + file_name, image)
    # NOTE(review): the timed span includes this blocking key wait, so the
    # reported time also contains how long the window stayed open.
    cv2.waitKey(0)
    cv2.destroyAllWindows()
    toc = time.time()
    avg_time = avg_time + (toc - tic) * 1000

# Mean per-image time in milliseconds over the images actually processed
# (previously divided by a hard-coded 60).
avg_time = avg_time / num_images if num_images else 0

这将起作用,但是由于分区裁剪不适用于函数,因此可能导致全表扫描。因此,最好的解决方案是在shell中计算date-1天并将其作为参数传递到HQL脚本中:

set hivevar:ds=2018-12-01;

set hive.exec.dynamic.partition.mode=nonstrict;

-- Dynamic partition insert: the partition value is taken from the last
-- column of the outer SELECT, so d.day must be selected.
INSERT OVERWRITE table XTABLE partition(day)
select distinct d.emp_id, d.start_time, d.end_time, d.day from
(
select emp_id, start_time, end_time, day --partition present in dataset, also it can be date_sub('${hivevar:ds}',1) as day
from XTABLE where day=date_sub('${hivevar:ds}',1)
) d;

在shell中计算日期并传给脚本(脚本内用 '${hiveconf:ds}' 引用该值):ds=$(date +"%Y-%m-%d" --date " -1 day"); hive --hiveconf ds="$ds" -f your_script.hql