我想确定一个将通过外部表导入的CSV文件的行数。我的程序需要知道总行数,以便做出相应的处理。
问题是,最快的方法是什么?
答案 0 :(得分:1)
两个显而易见的选项是只计算外部表中的行,或逐行读取文件并计算它们。使用小文件没有太大区别,逐行方法可以稍微快一点;但是对于更大的文件,外部表查询的本地特性使其显着更快。
但您也可以用文件大小作为行数的近似值——这取决于您需要的准确度。文件大小可以直接获取(UTL_FILE.FGETATTR),也可以通过BFILE获取;另外,通过BFILE把文件加载到CLOB中比用UTL_FILE读取文件更快——然后可以通过查找换行符来计算行数。
测试块:
declare
  -- Benchmark block: compares several ways of sizing/counting the lines of
  -- the file 'root.dat' in directory object D42, printing the result and the
  -- CPU time (dbms_utility.get_cpu_time units) consumed by each approach:
  --   1. count(*) over the external table t42_ext
  --   2. line-by-line read with utl_file.get_line
  --   3. file size via utl_file.fgetattr
  --   4. file size via dbms_lob.getlength on a BFILE
  --   5. load the file into a temporary CLOB and report its length
  --   6. count chr(10) in the CLOB via length() difference
  --   7. count chr(10) in the CLOB via regexp_count
  l_time         pls_integer;
  l_count        pls_integer;
  l_file         utl_file.file_type;
  l_line         varchar2(32767);
  l_bfile        bfile;
  l_clob         clob;
  -- dbms_lob.loadclobfromfile declares these parameters as INTEGER;
  -- pls_integer would overflow for LOBs of 2 GiB or more.
  l_dest_offset  integer := 1;
  l_src_offset   integer := 1;
  l_lang_context integer := dbms_lob.default_lang_ctx;
  l_warning      integer;
  l_exists       boolean;
  -- utl_file.fgetattr returns the length as NUMBER; pls_integer would
  -- overflow for files of 2 GiB or more.
  l_file_length  number;
  l_block_size   pls_integer;
begin
  -- 1. Native row count through the external table.
  l_time := dbms_utility.get_cpu_time;
  select count(*) into l_count from t42_ext;
  dbms_output.put_line('external table: count ' || l_count
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 2. Read the file line by line; get_line raises no_data_found at EOF.
  l_time := dbms_utility.get_cpu_time;
  l_count := 0;
  l_file := utl_file.fopen('D42', 'root.dat', 'r', 32767);
  loop
    begin
      utl_file.get_line(l_file, l_line, 32767);
    exception
      when no_data_found then
        exit;
    end;
    l_count := l_count + 1;
  end loop;
  utl_file.fclose(l_file);
  dbms_output.put_line('utl_file read loop: count ' || l_count
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 3. File size from the file attributes.
  l_time := dbms_utility.get_cpu_time;
  utl_file.fgetattr('D42', 'root.dat', l_exists, l_file_length, l_block_size);
  dbms_output.put_line('utl_file fgetattr: size ' || l_file_length
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 4. File size through a BFILE locator.
  l_time := dbms_utility.get_cpu_time;
  l_bfile := bfilename('D42', 'root.dat');
  dbms_output.put_line('bfile getlength: size ' || dbms_lob.getLength(l_bfile)
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 5. Load the whole file into a temporary CLOB and report the CLOB length.
  l_time := dbms_utility.get_cpu_time;
  dbms_lob.open(l_bfile, dbms_lob.lob_readonly);
  dbms_lob.createtemporary(l_clob, false);
  dbms_lob.loadclobfromfile(l_clob, l_bfile, dbms_lob.getLength(l_bfile),
    l_dest_offset, l_src_offset, dbms_lob.default_csid, l_lang_context, l_warning);
  dbms_lob.close(l_bfile);
  -- Measure the loaded CLOB (the original queried the already-closed BFILE).
  dbms_output.put_line('clob getlength: size ' || dbms_lob.getLength(l_clob)
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 6. Count newlines as the length difference after stripping chr(10).
  l_time := dbms_utility.get_cpu_time;
  dbms_output.put_line('clob length diff: count '
    || (length(l_clob) - length(replace(l_clob, chr(10))))
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  -- 7. Count newlines with a regular expression.
  l_time := dbms_utility.get_cpu_time;
  dbms_output.put_line('clob regexp_count: count '
    || regexp_count(l_clob, chr(10))
    || ' took ' || (dbms_utility.get_cpu_time - l_time));

  dbms_lob.freetemporary(l_clob);
end;
/
针对两个示例文件运行(Linux上为11gR2):
PL/SQL procedure successfully completed.
external table: count 1024 took 1
utl_file read loop: count 1024 took 1
utl_file fgetattr: size 189440 took 0
bfile getlength: size 189440 took 0
clob getlength: size 189440 took 2
clob length diff: count 1024 took 0
clob regexp_count: count 1024 took 22
PL/SQL procedure successfully completed.
external table: count 1048576 took 85
utl_file read loop: count 1048576 took 1403
utl_file fgetattr: size 193986560 took 0
bfile getlength: size 193986560 took 0
clob getlength: size 193986560 took 742
clob length diff: count 1048576 took 374
clob regexp_count: count 1048576 took 21808
操作系统和/或存储设备的文件缓存也可能会产生一些影响。但根据此测试,检查文件大小是最快的,其次是直接对外部表中的行进行计数。计算已加载CLOB中的换行符要慢得多,而通过UTL_FILE逐行计数还要更慢(这并不令人意外)。对CLOB使用正则表达式计数则太慢,即使对于较小的文件也不值得考虑。
您应该在自己的环境中使用实际数据进行测试。
答案 1 :(得分:0)
我按照以下方式解决了
我编写了一个放在数据库服务器上的shell脚本( rowcount.sh ),内容如下:
# Print the number of lines in the file whose path is given as $1.
# "$1" is quoted so that paths containing spaces or glob characters are
# passed to cat as a single, literal argument.
/usr/bin/cat "$1" | /usr/bin/wc -l
在oracle中创建了一个目录
-- Directory object exec_dir mapped to /home/oracle/; the external table
-- created by file_row_count names it in its PREPROCESSOR clause
-- (exec_dir:'rowcount.sh').
CREATE DIRECTORY exec_dir AS '/home/oracle/';
-- Give user stc READ and EXECUTE on exec_dir.
GRANT READ, EXECUTE ON DIRECTORY exec_dir TO stc;
编写了以下存储过程,以利用外部表及其预处理器(PREPROCESSOR)功能。
CREATE OR REPLACE PROCEDURE file_row_count(filename IN VARCHAR2, line_count OUT NUMBER)
IS
  -- Returns in line_count the single NUMBER value produced by the
  -- preprocessor script exec_dir:'rowcount.sh' for the file `filename`
  -- located in directory EXT_CSV_DIR.
  --
  -- How it works: a uniquely named one-column external table is created with
  -- rowcount.sh as PREPROCESSOR, its single row is selected into line_count,
  -- and the table is dropped again.
  --
  -- Parameters:
  --   filename   - name of the file inside EXT_CSV_DIR.
  --   line_count - value read from the external table; stays NULL when an
  --                error occurred before the value could be read.
  --
  -- Errors: any exception is reported through dbms_output and swallowed
  -- after an attempt to drop the temporary table; an exception raised by
  -- that cleanup DROP itself propagates to the caller.
  --
  -- NOTE: CREATE TABLE / DROP TABLE are DDL, which Oracle commits
  -- implicitly, so calling this procedure ends the caller's open transaction.
  l_random_table_name VARCHAR2(30) := 'import_'||SUBSTR(SYS_GUID(),1,15);
  l_count NUMBER;
  l_safe_filename VARCHAR2(32767);
BEGIN
  -- filename is spliced into a quoted literal of the dynamic DDL below (DDL
  -- cannot take bind variables), so embedded single quotes are doubled to
  -- keep the value inside the literal and prevent SQL injection.
  l_safe_filename := REPLACE(filename, '''', '''''');

  EXECUTE IMMEDIATE 'CREATE TABLE '||l_random_table_name||'(
    line_count NUMBER
  )
  ORGANIZATION EXTERNAL (
    TYPE ORACLE_LOADER
    DEFAULT DIRECTORY EXT_CSV_DIR
    ACCESS PARAMETERS (
      RECORDS DELIMITED BY NEWLINE
      PREPROCESSOR exec_dir:''rowcount.sh''
      NOBADFILE
      NODISCARDFILE
      NOLOGFILE
      FIELDS
    )
    LOCATION (EXT_CSV_DIR:'''||l_safe_filename||''')
  )
  REJECT LIMIT UNLIMITED
  NOPARALLEL
  NOMONITORING'
  ;

  EXECUTE IMMEDIATE 'SELECT LINE_COUNT FROM '||l_random_table_name INTO line_count;
  EXECUTE IMMEDIATE 'DROP TABLE '||l_random_table_name;
EXCEPTION
  WHEN OTHERS THEN
    dbms_output.put_line(SQLERRM);
    dbms_output.put_line('try to drop temp table'||l_random_table_name);
    -- Only drop the table if it was actually created before the failure.
    SELECT COUNT(*) INTO l_count
    FROM SYS.USER_TABLES
    WHERE table_name = UPPER(l_random_table_name)
    ;
    IF l_count > 0 THEN
      EXECUTE IMMEDIATE 'DROP TABLE '||l_random_table_name;
    END IF;
END file_row_count;
/
对需要导入的文件调用该存储过程:
DECLARE
  -- File to be counted, passed to FILE_ROW_COUNT as its filename argument.
  v_csv_name VARCHAR2(200) := 'data0.csv';
  -- Receives the LINE_COUNT OUT value.
  v_rows     NUMBER;
BEGIN
  file_row_count(v_csv_name, v_rows);
  dbms_output.put_line('LINE_COUNT = ' || v_rows);
END;