Daily Archives: September 11, 2014

Auto-Profiling for a Table in Oracle

As database (and IT) professionals, we’ve all been in the situation of doing manual profiling of data in tables. How many rows does the table have? For any given column, are there any NULLs, what is the cardinality of the data, what are the sample types of data that exist in the column?

There are automated tools that do this sort of thing — Talend, for one, is a vendor that creates software that helps with things like this — but sometimes you just want a quick-and-dirty script to get things done for you.

I’ve spent a fair amount of time doing this recently and answering client questions about this in their systems, and found manual execution of these kinds of queries to be time-consuming. So like any good programmer, I set out to automate the task! I wrote a PL-SQL script that, given a particular schema and table in an Oracle database, runs a set of queries to give various pieces of information about the columns. It outputs HTML that you can cut-and-paste out of SQL Developer (or your Oracle client of choice) and save into an HTML file.

EDIT 4/13/2015: I was working on some further profiling tasks and made some big-fixes and cleanup to this script. These edits involve a) Improving support for DATE and TIMESTAMP data types in profiling, b) now showing the datatype of the column in question in each row, and c) adjustments to substitution and bind variables to allow the script to be run as a report in the newest version of SQL Developer.

Info that the updated version of the script gives:

  1. Table Name
  2. Column Name
  3. Data Type (of the Column)
  4. Minimum Value (if Numeric, date, or datetime datatype)
  5. Mean (if Numeric datatype)
  6. Median (if Numeric, date, or datetime datatype)
  7. Maximum (if Numeric, date, or datetime datatype)
  8. Standard Deviation (if Numeric datatype)
  9. Count of Distinct Values
  10. Count of Non-NULL Rows
  11. Count of Non-Zero and Non-NULL Rows (tables I have been looking at often have 0s instead of NULLs, but the 0 doesn’t give any meaningful info either and I wanted to exclude it, does not work for date or datetime datatypes)
  12. Percent of Non-NULL Rows
  13. Percent of Non-Zero and Non-NULL Rows (only for non-datetime and timestamp datatypes)

This is the kind of output you can expect from the script:

profile_example

I hope this is useful to you! Let me know if you have any questions, find any bugs with the script, or have any suggestions for improvement.

WARNING: IF YOU RUN THIS SCRIPT ON A BIG TABLE IT WILL TAKE A LONG TIME AND YOUR DBA MAY NOT BE HAPPY WITH YOU. By big, I mean anything with over a million rows or so and/or more than 50-ish columns or thereof.

Here is the script:

SET SERVEROUTPUT ON;
ALTER SESSION SET NLS_DATE_FORMAT = 'MM/DD/YYYY';

DECLARE

TYPE deltaCurType             IS REF CURSOR;
j_deltatablenames_cur         deltaCurType;
rec_str_deltatablenames       ALL_TAB_COLUMNS%ROWTYPE;
v_object_str                  VARCHAR2(4000);
v_stmt_str                    VARCHAR2(4000);
v_OWNER_NAME                  VARCHAR2(2000) := &OWNER_NAME;
v_TABLE_NAME                  VARCHAR2(2000) := &TABLE_NAME;
v_COLUMN_NAME                 VARCHAR2(2000);
v_COLUMN_TYPE                 VARCHAR2(2000);
v_MIN                         VARCHAR2(2000);
v_AVERAGE_MEAN                NUMBER;
v_AVERAGE_MEDIAN              VARCHAR2(2000);
v_MAX                         VARCHAR2(2000);
v_STANDARD_DEVIATION          NUMBER;
v_CNT_NON_NULL_ROWS           NUMBER;
v_CNT_NON_ZERO_NON_NULL_ROWS  NUMBER;
v_CNT_TOTAL_ROWS              NUMBER;
v_PCT_NON_NULL_ROWS             NUMBER;
v_PCT_NON_ZERO_NON_NULL_ROWS     NUMBER;
v_CNT_DISTINCT_VALUES         NUMBER;
v_code                        NUMBER;
v_errm                        VARCHAR2(64);

BEGIN

DBMS_OUTPUT.ENABLE(1000000);

v_object_str := 'SELECT * FROM ALL_TAB_COLUMNS WHERE OWNER = ''' || v_OWNER_NAME || ''' AND TABLE_NAME = ''' || v_TABLE_NAME || ''' ORDER BY COLUMN_ID';

OPEN j_deltatablenames_cur FOR v_object_str;

FETCH j_deltatablenames_cur INTO rec_str_deltatablenames;

DBMS_OUTPUT.PUT_LINE('<HTML><TABLE BORDER="1">');

DBMS_OUTPUT.PUT_LINE('REPORT RUN AT ' || SYSTIMESTAMP);

DBMS_OUTPUT.PUT_LINE('<TR><TD><B>TABLE NAME</B></TD>
<TD>COLUMN NAME</TD>
<TD>DATA TYPE</TD>
<TD>MINIMUM</TD>
<TD>MEAN</TD>
<TD>MEDIAN</TD>
<TD>MAXIMUM</TD>
<TD>STANDARD DEVIATION</TD>
<TD>COUNT DISTINCT VALUES</TD>
<TD>COUNT NON-NULL ROWS</TD>
<TD>COUNT NON-NULL AND NON-ZERO ROWS</TD>
<TD>COUNT TOTAL ROWS</TD>
<TD>PERCENT NON-NULL ROWS</TD>
<TD>PERCENT NON-NULL AND NON-ZERO ROWS</TD>
</TR>');

WHILE j_deltatablenames_cur%FOUND LOOP

v_stmt_str := 'SELECT ''' || v_TABLE_NAME || ''' AS TABLE_NAME, '''
|| rec_str_deltatablenames.COLUMN_NAME || ''' AS COLUMN_NAME, ';

v_stmt_str := v_stmt_str || '''' || rec_str_deltatablenames.DATA_TYPE || ''' AS DATA_TYPE, ';

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MIN(' || rec_str_deltatablenames.COLUMN_NAME || ') AS MIN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS MIN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(AVG(' || rec_str_deltatablenames.COLUMN_NAME || '), 2) AS AVERAGE_MEAN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS AVERAGE_MEAN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MEDIAN(' || rec_str_deltatablenames.COLUMN_NAME || ') AS AVERAGE_MEDIAN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS AVERAGE_MEDIAN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MAX(' || rec_str_deltatablenames.COLUMN_NAME || ') AS MAX, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS MAX, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(STDDEV(' || rec_str_deltatablenames.COLUMN_NAME || '), 2) AS STANDARD_DEVIATION, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS STANDARD_DEVIATION, ';
END IF;

v_stmt_str := v_stmt_str || ' COUNT(DISTINCT(' || rec_str_deltatablenames.COLUMN_NAME || ')) AS CNT_DISTINCT_VALUES, ';

v_stmt_str := v_stmt_str || ' COUNT(' || rec_str_deltatablenames.COLUMN_NAME || ') AS CNT_NON_NULL_ROWS, ';

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> 0 THEN 1 ELSE 0 END) AS CNT_NON_ZERO_NON_NULL_ROWS, ';
ELSIF rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' OR rec_str_deltatablenames.DATA_TYPE = 'DATE' THEN
v_stmt_str := v_stmt_str || 'NULL AS CNT_NON_ZERO_NON_NULL_ROWS, ';
ELSE
v_stmt_str := v_stmt_str || 'SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> ''0'' THEN 1 ELSE 0 END) AS CNT_NON_ZERO_NON_NULL_ROWS, ';
END IF;

v_stmt_str := v_stmt_str || 'COUNT(*) AS CNT_TOTAL_ROWS, ROUND(COUNT(' || rec_str_deltatablenames.COLUMN_NAME || ')/COUNT(*), 4) * 100 AS PCT_NON_NULL_ROWS, ';

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> 0 THEN 1 ELSE 0 END)/COUNT(*), 4) * 100 AS PCT_NON_ZERO_NON_NULL_ROWS ';
ELSIF rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' OR rec_str_deltatablenames.DATA_TYPE = 'DATE' THEN
v_stmt_str := v_stmt_str || 'NULL AS PCT_NON_ZERO_NON_NULL_ROWS ';
ELSE
v_stmt_str := v_stmt_str || 'ROUND(SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> ''0'' THEN 1 ELSE 0 END)/COUNT(*), 4) * 100 AS PCT_NON_ZERO_NON_NULL_ROWS ';
END IF;

v_stmt_str := v_stmt_str || ' FROM ' || v_TABLE_NAME || ' WHERE PRODUCT_TYPE_CODE IN (''FEAT'', ''TVMOV'') AND CURRENT_INDICATOR = ''Y''';

--DBMS_OUTPUT.PUT_LINE(v_stmt_str); -- for debugging

EXECUTE IMMEDIATE v_stmt_str INTO v_TABLE_NAME,   v_COLUMN_NAME,   v_COLUMN_TYPE, v_MIN,   v_AVERAGE_MEAN,   v_AVERAGE_MEDIAN,   v_MAX,   v_STANDARD_DEVIATION,  v_CNT_DISTINCT_VALUES, v_CNT_NON_NULL_ROWS,   v_CNT_NON_ZERO_NON_NULL_ROWS,   v_CNT_TOTAL_ROWS,   v_PCT_NON_NULL_ROWS,   v_PCT_NON_ZERO_NON_NULL_ROWS;

DBMS_OUTPUT.PUT_LINE('<TR><TD>' || v_TABLE_NAME || '</TD><TD>' ||   v_COLUMN_NAME || '</TD><TD>' || v_COLUMN_TYPE || '</TD><TD>' ||  v_MIN || '</TD><TD>' ||   v_AVERAGE_MEAN || '</TD><TD>' ||   v_AVERAGE_MEDIAN || '</TD><TD>' ||   v_MAX || '</TD><TD>' ||   v_STANDARD_DEVIATION || '</TD><TD>' || v_CNT_DISTINCT_VALUES || '</TD><TD>' ||   v_CNT_NON_NULL_ROWS || '</TD><TD>' ||   v_CNT_NON_ZERO_NON_NULL_ROWS || '</TD><TD>' ||   v_CNT_TOTAL_ROWS || '</TD><TD>' ||   v_PCT_NON_NULL_ROWS || '</TD><TD>' ||   v_PCT_NON_ZERO_NON_NULL_ROWS || '</TD></TR>');

FETCH j_deltatablenames_cur INTO rec_str_deltatablenames;
END LOOP;

CLOSE j_deltatablenames_cur;

DBMS_OUTPUT.PUT_LINE('</TABLE></HTML>');

EXCEPTION

WHEN OTHERS THEN
v_code := SQLCODE;
v_errm := SUBSTR(SQLERRM, 1, 64);
DBMS_OUTPUT.PUT_LINE (v_code || ' ' || v_errm);
DBMS_OUTPUT.PUT_LINE(v_stmt_str);

END;

 

The above is the version of the script to execute in Toad or in SQL Developer as a script. HOWEVER, if you would like to implement this in a report in SQL Developer, there are some minor tweaks that I had to make. SQL Developer reports do not seem to like substitution variables, but do function with bind variables. So I changed to substitution variables and also removed the couple of initialization lines before the DECLARE statement.

Version for SQL Developer Report (create report as type PL/SQL DBMS Output)

DECLARE

TYPE deltaCurType             IS REF CURSOR;
j_deltatablenames_cur         deltaCurType;
rec_str_deltatablenames       ALL_TAB_COLUMNS%ROWTYPE;
v_object_str                  VARCHAR2(4000);
v_stmt_str                    VARCHAR2(4000);
v_OWNER_NAME                  VARCHAR2(2000) := :OWNER_NAME;
v_TABLE_NAME                  VARCHAR2(2000) := :TABLE_NAME;
v_COLUMN_NAME                 VARCHAR2(2000);
v_COLUMN_TYPE                 VARCHAR2(2000);
v_MIN                         VARCHAR2(2000);
v_AVERAGE_MEAN                NUMBER;
v_AVERAGE_MEDIAN              VARCHAR2(2000);
v_MAX                         VARCHAR2(2000);
v_STANDARD_DEVIATION          NUMBER;
v_CNT_NON_NULL_ROWS           NUMBER;
v_CNT_NON_ZERO_NON_NULL_ROWS  NUMBER;
v_CNT_TOTAL_ROWS              NUMBER;
v_PCT_NON_NULL_ROWS             NUMBER;
v_PCT_NON_ZERO_NON_NULL_ROWS     NUMBER;
v_CNT_DISTINCT_VALUES         NUMBER;
v_code                        NUMBER;
v_errm                        VARCHAR2(64);

BEGIN

DBMS_OUTPUT.ENABLE(1000000);

v_object_str := 'SELECT * FROM ALL_TAB_COLUMNS WHERE OWNER = ''' || v_OWNER_NAME || ''' AND TABLE_NAME = ''' || v_TABLE_NAME || ''' ORDER BY COLUMN_ID';

OPEN j_deltatablenames_cur FOR v_object_str;

FETCH j_deltatablenames_cur INTO rec_str_deltatablenames;

DBMS_OUTPUT.PUT_LINE('<HTML><TABLE BORDER="1">');

DBMS_OUTPUT.PUT_LINE('REPORT RUN AT ' || SYSTIMESTAMP);

DBMS_OUTPUT.PUT_LINE('<TR><TD><B>TABLE NAME</B></TD>
<TD>COLUMN NAME</TD>
<TD>DATA TYPE</TD>
<TD>MINIMUM</TD>
<TD>MEAN</TD>
<TD>MEDIAN</TD>
<TD>MAXIMUM</TD>
<TD>STANDARD DEVIATION</TD>
<TD>COUNT DISTINCT VALUES</TD>
<TD>COUNT NON-NULL ROWS</TD>
<TD>COUNT NON-NULL AND NON-ZERO ROWS</TD>
<TD>COUNT TOTAL ROWS</TD>
<TD>PERCENT NON-NULL ROWS</TD>
<TD>PERCENT NON-NULL AND NON-ZERO ROWS</TD>
</TR>');

WHILE j_deltatablenames_cur%FOUND LOOP

v_stmt_str := 'SELECT ''' || v_TABLE_NAME || ''' AS TABLE_NAME, '''
|| rec_str_deltatablenames.COLUMN_NAME || ''' AS COLUMN_NAME, ';

v_stmt_str := v_stmt_str || '''' || rec_str_deltatablenames.DATA_TYPE || ''' AS DATA_TYPE, ';

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MIN(' || rec_str_deltatablenames.COLUMN_NAME || ') AS MIN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS MIN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(AVG(' || rec_str_deltatablenames.COLUMN_NAME || '), 2) AS AVERAGE_MEAN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS AVERAGE_MEAN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MEDIAN(' || rec_str_deltatablenames.COLUMN_NAME || ') AS AVERAGE_MEDIAN, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS AVERAGE_MEDIAN, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE IN ('NUMBER', 'DATE') OR rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' THEN
v_stmt_str := v_stmt_str || 'MAX(' || rec_str_deltatablenames.COLUMN_NAME || ') AS MAX, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS MAX, ';
END IF;

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(STDDEV(' || rec_str_deltatablenames.COLUMN_NAME || '), 2) AS STANDARD_DEVIATION, ';
ELSE
v_stmt_str := v_stmt_str || 'NULL AS STANDARD_DEVIATION, ';
END IF;

v_stmt_str := v_stmt_str || ' COUNT(DISTINCT(' || rec_str_deltatablenames.COLUMN_NAME || ')) AS CNT_DISTINCT_VALUES, ';

v_stmt_str := v_stmt_str || ' COUNT(' || rec_str_deltatablenames.COLUMN_NAME || ') AS CNT_NON_NULL_ROWS, ';

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> 0 THEN 1 ELSE 0 END) AS CNT_NON_ZERO_NON_NULL_ROWS, ';
ELSIF rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' OR rec_str_deltatablenames.DATA_TYPE = 'DATE' THEN
v_stmt_str := v_stmt_str || 'NULL AS CNT_NON_ZERO_NON_NULL_ROWS, ';
ELSE
v_stmt_str := v_stmt_str || 'SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> ''0'' THEN 1 ELSE 0 END) AS CNT_NON_ZERO_NON_NULL_ROWS, ';
END IF;

v_stmt_str := v_stmt_str || 'COUNT(*) AS CNT_TOTAL_ROWS, ROUND(COUNT(' || rec_str_deltatablenames.COLUMN_NAME || ')/COUNT(*), 4) * 100 AS PCT_NON_NULL_ROWS, ';

IF rec_str_deltatablenames.DATA_TYPE = 'NUMBER' THEN
v_stmt_str := v_stmt_str || 'ROUND(SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> 0 THEN 1 ELSE 0 END)/COUNT(*), 4) * 100 AS PCT_NON_ZERO_NON_NULL_ROWS ';
ELSIF rec_str_deltatablenames.DATA_TYPE LIKE 'TIMESTAMP%' OR rec_str_deltatablenames.DATA_TYPE = 'DATE' THEN
v_stmt_str := v_stmt_str || 'NULL AS PCT_NON_ZERO_NON_NULL_ROWS ';
ELSE
v_stmt_str := v_stmt_str || 'ROUND(SUM(CASE WHEN ' || rec_str_deltatablenames.COLUMN_NAME || ' <> ''0'' THEN 1 ELSE 0 END)/COUNT(*), 4) * 100 AS PCT_NON_ZERO_NON_NULL_ROWS ';
END IF;

v_stmt_str := v_stmt_str || ' FROM ' || v_TABLE_NAME || ' WHERE PRODUCT_TYPE_CODE IN (''FEAT'', ''TVMOV'') AND CURRENT_INDICATOR = ''Y''';

--DBMS_OUTPUT.PUT_LINE(v_stmt_str); -- for debugging

EXECUTE IMMEDIATE v_stmt_str INTO v_TABLE_NAME,   v_COLUMN_NAME,   v_COLUMN_TYPE, v_MIN,   v_AVERAGE_MEAN,   v_AVERAGE_MEDIAN,   v_MAX,   v_STANDARD_DEVIATION,  v_CNT_DISTINCT_VALUES, v_CNT_NON_NULL_ROWS,   v_CNT_NON_ZERO_NON_NULL_ROWS,   v_CNT_TOTAL_ROWS,   v_PCT_NON_NULL_ROWS,   v_PCT_NON_ZERO_NON_NULL_ROWS;

DBMS_OUTPUT.PUT_LINE('<TR><TD>' || v_TABLE_NAME || '</TD><TD>' ||   v_COLUMN_NAME || '</TD><TD>' || v_COLUMN_TYPE || '</TD><TD>' ||  v_MIN || '</TD><TD>' ||   v_AVERAGE_MEAN || '</TD><TD>' ||   v_AVERAGE_MEDIAN || '</TD><TD>' ||   v_MAX || '</TD><TD>' ||   v_STANDARD_DEVIATION || '</TD><TD>' || v_CNT_DISTINCT_VALUES || '</TD><TD>' ||   v_CNT_NON_NULL_ROWS || '</TD><TD>' ||   v_CNT_NON_ZERO_NON_NULL_ROWS || '</TD><TD>' ||   v_CNT_TOTAL_ROWS || '</TD><TD>' ||   v_PCT_NON_NULL_ROWS || '</TD><TD>' ||   v_PCT_NON_ZERO_NON_NULL_ROWS || '</TD></TR>');

FETCH j_deltatablenames_cur INTO rec_str_deltatablenames;
END LOOP;

CLOSE j_deltatablenames_cur;

DBMS_OUTPUT.PUT_LINE('</TABLE></HTML>');

EXCEPTION

WHEN OTHERS THEN
v_code := SQLCODE;
v_errm := SUBSTR(SQLERRM, 1, 64);
DBMS_OUTPUT.PUT_LINE (v_code || ' ' || v_errm);
DBMS_OUTPUT.PUT_LINE(v_stmt_str);

END;