Thursday, September 08, 2005

Analytics with ORDER BY and DISTINCT STRAGG

A recent request in a help forum required an SQL that would return rows until 5 distinct values of a specific column had been returned.

Using analytics was proving difficult as the DISTINCT clause wouldn't work with an ORDER BY. While a non-analytical function was found, I offered an analytic SQL that used a user-defined aggregate. This is similar to STRAGG but eliminates duplicates prior to returning the result string. [The elimination process involves 'associative arrays' - the old INDEX BY tables with a VARCHAR2 index. This was new in 9iR2 so won't work in earlier versions where you'd need to develop your own sort routine.]


create or replace type str_dist_agg_type as object
(
total varchar2(4000),
static function
ODCIAggregateInitialize(sctx IN OUT str_dist_agg_type )
return number,
member function
ODCIAggregateIterate(self IN OUT str_dist_agg_type ,
value IN varchar2 )
return number,
member function
ODCIAggregateTerminate(self IN str_dist_agg_type,
returnValue OUT varchar2,
flags IN number)
return number,
member function
ODCIAggregateMerge(self IN OUT str_dist_agg_type,
ctx2 IN str_dist_agg_type)
return number,
member function
ODCIAggregateDelete(self IN OUT str_dist_agg_type,
value in varchar2)
return number
)
/

create or replace type body str_dist_agg_type
is
--
static function ODCIAggregateInitialize(sctx IN OUT str_dist_agg_type)
return number
is
begin
sctx := str_dist_agg_type( null );
return ODCIConst.Success;
end;
--
member function ODCIAggregateIterate(self IN OUT str_dist_agg_type,
value IN varchar2 )
return number
is
begin
--Use chr(7) as an internal separator because, since
--it is a pretty useless control character, it is unlikely
--to be in the input data or cause any internal problems.
self.total := self.total || value || chr(7);
return ODCIConst.Success;
end;
--
member function ODCIAggregateTerminate(self IN str_dist_agg_type,
returnValue OUT varchar2,
flags IN number)
return number
is
type tab_dist is table of number index by varchar2(4000);
t_dist tab_dist;
v_point number := 0;
v_str varchar2(4000);
v_sep varchar2(1) := chr(7);
v_ret varchar2(4000);
begin
v_str := rtrim(ltrim(self.total,v_sep),v_sep);
v_point := instr(v_str,v_sep);
while nvl(v_point,0) != 0 loop
t_dist(substr(v_str,1,v_point-1)) := 1;
v_str := ltrim(substr(v_str,v_point),v_sep);
v_point := instr(v_str,v_sep);
end loop;
t_dist(v_str) := 1;
--
v_str := t_dist.first;
while v_str is not null loop
v_ret := v_ret || v_str;
v_str := t_dist.next(v_str);
end loop;
returnValue := v_ret;
return ODCIConst.Success;
end;
--
member function ODCIAggregateDelete(self IN OUT str_dist_agg_type,
value in varchar2)
return number
is
v_point number;
begin
v_point := instr(self.total,value||chr(7));
if v_point > 0 then
self.total := substr(self.total,1,v_point-1)||
substr(self.total, v_point+length(value)+1);
end if;
return ODCIConst.Success;
end;
--
member function ODCIAggregateMerge(self IN OUT str_dist_agg_type,
ctx2 IN str_dist_agg_type)
return number
is
begin
self.total := self.total ||chr(7)||ctx2.total;
return ODCIConst.Success;
end;
end;
/

CREATE or replace
FUNCTION strdagg(input varchar2 )
RETURN varchar2
PARALLEL_ENABLE AGGREGATE USING str_dist_agg_type;

grant execute on strdagg to public;

Note : STRDAGG uses the CHR(7) character internally as a delimiter, so if this is in the data you are likely to see problems. No 'visible' delimiter is shown, so if you want one you'll need to specify it as part of the expression :

select table_name, column_name, strdagg(data_type||',') over
(partition by table_name order by table_name, column_id) data_types
from user_tab_columns
where table_name in ('CHILD','PARENT')

TABLE_NAME COLUMN_NAME DATA_TYPES
---------- --------------- --------------------
CHILD ID NUMBER,
CHILD PARENT_ID NUMBER,
CHILD DESCRIPTION NUMBER,VARCHAR2,
PARENT ID NUMBER,
PARENT DESCRIPTION NUMBER,VARCHAR2,

I've put all the 'tricky' processing in the TERMINATE function so it can be easily amended (eg if you wanted to return the number of duplicates for each value or have a case-insensitive sort).

(If you are interested in the original problem and the non-analytical solution, it is in the Quest Pipelines' PL/SQL forum entitled 'Tricky SQL Challenge' in the 'Writing and Tuning SQL' conference.)

No comments: