[openGauss] I rubbed a UTL for openGauss with plsql_ URL package

preface

Recently, I began to bite the openGauss compatible package. I'll practice with a simpler one first.

UTL_ The url package has only two functions, "ESCAPE" and "unscape", which are actually two functions of URLENCODE and URLDECODE used in other development languages. They are mainly used to process some specific characters of the url string, so as not to cause ambiguity when transmitting the url string as data. They are often used. At present, three-party extensions support these two functions( https://github.com/okbob/url_encode ), but it's a little inconvenient to compile and expand in openGauss. I write one directly with plsql, which is easier to install and use.

First try to restore the logic of the two functions in this package in oracle

ESCAPE

UTL_URL.ESCAPE (
   url                   IN VARCHAR2 CHARACTER SET ANY_CS,
   escape_reserved_chars IN BOOLEAN  DEFAULT  FALSE,
   url_charset           IN VARCHAR2 DEFAULT  utl_http.body_charset)
RETURN VARCHAR2;

ESCAPE has three parameters: url string, whether to replace reserved characters (no replacement by default), and url character set
Usage examples

--Pass only URL,The other two parameters are default
select utl_url.escape('https://www.darkathena.top/archives / I started a blog ') from dual;
--https://www.darkathena.top/archives/%BF%BF%BF%BF
--Obviously,Here, because the character set is not specified,The result of conversion is incorrect

--For non ASCII Character with UTF8 code
select utl_url.escape('https://www.darkathena.top/archives / I started blogging ', URL_ charset=>'AL32UTF8') from dual;
--https://www.darkathena.top/archives/%E6%88%91%E5%BC%80%E5%8D%9A%E4%BA%86

--For non ASCII Character with GBK Encode
select utl_url.escape('https://www.darkathena.top/archives / I started blogging ', URL_ charset=>'ZHS16GBK') from dual;
--https://www.darkathena.top/archives/%CE%D2%BF%AA%B2%A9%C1%CB

--For non ASCII Character with GBK Encode,And replace all reserved characters
BEGIN
DBMS_OUTPUT.put_line( 
utl_url.escape('https://www.darkathena.top/archives / ', TRUE,'ZHS16GBK ');
END;
/
--https%3A%2F%2Fwww.darkathena.top%2Farchives%2F%CE%D2%BF%AA%B2%A9%C1%CB

It doesn't seem very troublesome. You just need to judge character by character. When you recognize the character that needs to be converted, get its binary data, express it in hexadecimal string, and format this hexadecimal string into the form of "% FF*n".
The code is as follows

CREATE OR REPLACE FUNCTION URLENCODE(url                   IN VARCHAR2 CHARACTER SET ANY_CS,
                                     escape_reserved_chars IN BOOLEAN DEFAULT FALSE,
                                     url_charset           IN VARCHAR2 DEFAULT 'al32utf8')
  RETURN VARCHAR2 AS
  L_TMP            VARCHAR2(6000);
  L_BAD            VARCHAR2(100) DEFAULT ' >%}\~];?@&<#{|^[`/:=$+''"';
  l_reserved_chars VARCHAR2(100) DEFAULT ';/?:@&=+$[]';
  L_CHAR           CHAR(1 CHAR);

BEGIN
  IF (url IS NULL) THEN
    RETURN NULL;
  END IF;
  if not escape_reserved_chars then
    L_BAD := translate(L_BAD, l_reserved_chars, '');
  end if;
  FOR I IN 1 .. LENGTH(url) LOOP
    L_CHAR := SUBSTR(url, I, 1);
    IF (INSTR(L_BAD, L_CHAR) > 0 or ascii(L_CHAR) > 255) THEN
      L_TMP := L_TMP || regexp_replace(rawtohex(utl_raw.cast_to_raw(convert(L_CHAR,
                                                                            url_charset))),
                                       '(.{2})',
                                       '%\1');
    ELSE
      L_TMP := L_TMP || L_CHAR;
    END IF;
  END LOOP;
  RETURN L_TMP;
END URLENCODE;
/

For the above usage example, use the URLENCODE function just created to utl_url.escape replacement can be executed and the output result is correct

UNESCAPE

UTL_URL.UNESCAPE (
   url            IN VARCHAR2 CHARACTER SET ANY_CS,
   url_charset    IN VARCHAR2 DEFAULT utl_http.body_charset)
RETURN VARCHAR2;

UNESCAPE has two parameters: the url string that has been urldecode d and the url character set
Usage examples

--with GBK Character set pair URL Restore
select utl_url.unescape('https://www.darkathena.top/archives/%CE%D2%BF%AA%B2%A9%C1%CB','ZHS16GBK') from dual;
--https://www.darkathena.top/archives / I started blogging

--with UTF8 Character set pair URL Restore
select utl_url.unescape('https://www.darkathena.top/archives/%E6%88%91%E5%BC%80%E5%8D%9A%E4%BA%86','AL32UTF8') from dual;
--https://www.darkathena.top/archives / I started blogging

This seems a little more difficult to deal with than the above one, because I consider that the number of bytes in different character sets is different, but later I think, you can not consider the character set first, convert ordinary characters into binary data, remove the percentage sign from hexadecimal characters, and directly use it as binary data in hexadecimal form, and spell all binary data together, Finally, it can be converted into a displayable string according to the specified character set
The code is as follows

CREATE OR REPLACE FUNCTION URLDECODE(url         IN VARCHAR2 CHARACTER SET ANY_CS,
                                     url_charset IN VARCHAR2 DEFAULT 'AL32UTF8')
  RETURN VARCHAR2 IS
  L_RETURN VARCHAR2(2000);
BEGIN
  select CONVERT(UTL_RAW.cast_to_varchar2(LISTAGG(CASE
                                                    WHEN LENGTH(A) = 3 THEN
                                                     HEXTORAW(REPLACE(A, '%'))
                                                    ELSE
                                                     UTL_RAW.cast_to_raw(A)
                                                  END) --within group (order by 1) --18c The following line of notes is cancelled
                                                  ),
                 url_charset)
    INTO L_RETURN
    from (select REGEXP_SUBSTR(url, '(%.{2}|.)', 1, LEVEL) A
            from DUAL
          CONNECT BY LENGTH(REGEXP_SUBSTR(url, '(%.{2}|.)', 1, LEVEL)) > 0) A;
  RETURN L_RETURN;
END;
/

Similarly, the new function urlcode can completely replace utl_url.unescape usage

Now that the logic has been sorted out, the next step is to migrate to openGauss

openGauss(postgresql) compatible code

The syntax and functions of openGauss are quite different from those of oracle, but since the logic has been written, it is not troublesome to change it. Just pay attention to using built-in functions as much as possible to improve efficiency

https://gitee.com/darkathena/opengauss-oracle/blob/main/oracle-package/utl_url.sql

create schema UTL_URL;

CREATE OR REPLACE FUNCTION UTL_URL.escape(url IN TEXT, escape_reserved_chars IN BOOL DEFAULT FALSE, url_charset IN TEXT DEFAULT 'UTF8')
RETURNS TEXT
LANGUAGE plpgsql
IMMUTABLE NOT FENCED NOT SHIPPABLE
AS $$
declare
  L_TMP            TEXT DEFAULT '';
  L_BAD            TEXT DEFAULT ' >%}\~];?@&<#{|^[`/:=$+''"';
  l_reserved_chars TEXT DEFAULT ';/?:@&=+$[]';
  L_CHAR           TEXT;
BEGIN
  IF (url IS NULL) THEN
    RETURN NULL;
  END IF;

  if not escape_reserved_chars then
    L_BAD := translate(L_BAD, l_reserved_chars, '');
  end if;

  FOR I IN 1..LENGTH(url) LOOP
    L_CHAR := SUBSTR(url, I, 1);
    IF (INSTR(L_BAD, L_CHAR) > 0 or ascii(L_CHAR) > 255) THEN
      L_TMP := L_TMP || regexp_replace(upper(REPLACE(convert_TO(L_CHAR, url_charset)::TEXT,'\x','')),'(.{2})',
                                       '%\1','g');
    ELSE
      L_TMP := L_TMP || L_CHAR;
    END IF;
  END LOOP;
  RETURN L_TMP;
END; $$;
/
 
CREATE OR REPLACE FUNCTION UTL_URL.unescape(url IN TEXT, url_charset IN TEXT DEFAULT 'UTF8')
RETURNS TEXT
LANGUAGE sql
IMMUTABLE NOT FENCED NOT SHIPPABLE
AS $$
  select CONVERT_FROM(string_agg(CASE
                                   WHEN LENGTH(A) = 3 THEN
                                     decode(REPLACE(A, '%'), 'HEX')
                                   ELSE
                                     A :: bytea
                                 END, '' :: bytea), url_charset) 
    from (select a
            from (select (regexp_matches(url, '(%..|.)', 'g')) [ 1 ] a ) ) A;
$$;
/

test

select utl_url.escape('https://www.darkathena.top/archives / I started a blog ', true, URL_ charset=>'GBK') union all
select utl_url.escape('https://www.darkathena.top/archives / I started blogging ', URL_ charset=>'GBK')  union all
select utl_url.escape('https://www.darkathena.top/archives / I started blogging ', URL_ charset=>'UTF8')  union all
select utl_url.unescape('https://www.darkathena.top/archives/%CE%D2%BF%AA%B2%A9%C1%CB','GBK')  union all
select utl_url.unescape('https://www.darkathena.top/archives/%E6%88%91%E5%BC%80%E5%8D%9A%E4%BA%86','UTF8') ;

In addition, if it is opengauss2 1.0 or above, and the A compatibility mode is selected when installing the database, you can also use the package method without creating A new schema

CREATE OR REPLACE package pg_catalog.UTL_URL
as
  function escape(url IN TEXT, escape_reserved_chars IN BOOL DEFAULT FALSE, url_charset IN TEXT DEFAULT 'UTF8')
  RETURN TEXT;

  FUNCTION unescape(url IN TEXT, url_charset IN TEXT DEFAULT 'UTF8')
  RETURN TEXT;
end UTL_URL;
/

CREATE OR REPLACE package body pg_catalog.UTL_URL
as
  function escape(url IN TEXT, escape_reserved_chars IN BOOL DEFAULT FALSE, url_charset IN TEXT DEFAULT 'UTF8')
  RETURN TEXT
  IMMUTABLE NOT FENCED NOT SHIPPABLE
  AS
    L_TMP            TEXT DEFAULT '';
    L_BAD            TEXT DEFAULT ' >%}\~];?@&<#{|^[`/:=$+''"';
    l_reserved_chars TEXT DEFAULT ';/?:@&=+$[]';
    L_CHAR           TEXT;
  BEGIN
    IF (url IS NULL) THEN
      RETURN NULL;
    END IF;

    if not escape_reserved_chars then
      L_BAD := translate(L_BAD, l_reserved_chars, '');
    end if;

    FOR I IN 1.. LENGTH(url) LOOP
      L_CHAR := SUBSTR(url, I, 1);
      IF (INSTR(L_BAD, L_CHAR) > 0 or ascii(L_CHAR) > 255) THEN
        L_TMP := L_TMP || regexp_replace(upper(REPLACE(convert_TO(L_CHAR, url_charset)::TEXT,'\x','')),'(.{2})',
                                       '%\1','g');
      ELSE
        L_TMP := L_TMP || L_CHAR;
      END IF;
    END LOOP;
    RETURN L_TMP;
  END;

  FUNCTION unescape(url IN TEXT, url_charset IN TEXT DEFAULT 'UTF8')
  RETURN TEXT
  IMMUTABLE NOT FENCED NOT SHIPPABLE
  AS
    l_return text;
  begin
    select CONVERT_FROM(string_agg(CASE
                                     WHEN LENGTH(A) = 3 THEN
                                       decode(REPLACE(A, '%'), 'HEX')
                                     ELSE
                                       A :: bytea
                                   END, '' :: bytea), url_charset) into l_return
      from (select a
              from (select (regexp_matches(url, '(%..|.)', 'g')) [ 1 ] a ) ) A;
    return l_return;
  end;
end UTL_URL;
/

Keywords: Database Oracle PostgreSQL opengauss

Added by Bryan Ando on Thu, 03 Feb 2022 10:04:16 +0200