Files
cl-xdg-trash/url-encode.lisp

185 lines
7.8 KiB
Common Lisp

(in-package :cl-xdg-trash/url-encode)
;; Some useful links
;; https://en.wikipedia.org/wiki/UTF-8
;; https://en.wikipedia.org/wiki/Percent-encoding
(declaim (ftype (function (character) t) url-unreserved-character-p))
(defun url-unreserved-character-p (char)
"Return non-nil if CHAR is an unreserved character in a URL."
(or (member char '(#\- #\_ #\~ #\.) :test #'eql)
(let ((code (char-code char)))
(or (and (>= code (char-code #\a))
(<= code (char-code #\z)))
(and (>= code (char-code #\A))
(<= code (char-code #\Z)))
(and (>= code (char-code #\0))
(<= code (char-code #\9)))))))
(deftype utf-8-code-point ()
'(integer 0 #x10FFFF))
(deftype utf-8-middle-byte ()
'(integer #b10000000 #b10111111))
(deftype utf-8-char ()
'(or
(cons (integer 0 #b1111111) null)
(cons (integer #b11000000 #b11011111) (cons utf-8-middle-byte null))
(cons (integer #b11100000 #b11101111)
(cons utf-8-middle-byte (cons utf-8-middle-byte null)))
(cons (integer #b11110000 #b11110111)
(cons utf-8-middle-byte (cons utf-8-middle-byte (cons utf-8-middle-byte null))))))
(declaim (ftype (function (character) list-of-code-points) utf-8-char))
(defun utf-8-encode-char (char)
"Encode CHAR, a character, to a list of bytes that make up its UTF-8
representation."
(let* ((code (char-code char))
(length (integer-length code)))
(cond
((<= length 7) (list code))
((<= length 11) (list (+ (ash code -6) #b11000000)
(+ (logand code #b111111) #b10000000)))
((<= length 16) (list (+ (ash code -12) #b11100000)
(+ (logand (ash code -6) #b111111) #b10000000)
(+ (logand code #b111111) #b10000000)))
(t (list (+ (ash code -24) #b11110000)
(+ (logand (ash code -12) #b111111) #b10000000)
(+ (logand (ash code -6) #b111111) #b10000000)
(+ (logand code #b111111) #b10000000))))))
(declaim (ftype (function (utf-8-char) character) utf-8-decode-char))
(defun utf-8-decode-char (char)
"Decode a list of up to 4 UTF-8 bytes to and return the character the
represent."
(check-type char utf-8-char)
(destructuring-bind (&optional part1 part2 part3 part4) char
(declare (type (or null (integer 0 #xFF)) part1 part2 part3 part4))
(code-char
(cond
;; 4 byte
(part4 (+ (ash (logand part1 #b111) 18)
(ash (logand part2 #b111111) 12)
(ash (logand part3 #b111111) 6)
(logand part4 #b111111)))
;; 3 bytes
(part3 (+ (ash (logand part1 #b1111) 12)
(ash (logand part2 #b111111) 6)
(logand part3 #b111111)))
;; 2 bytes
(part2 (+ (ash (logand part1 #b11111) 6)
(logand part2 #b111111)))
;; 1 byte
(t part1)))))
(declaim (ftype (function (character &key (:safe-chars list)) list)
escape-char-for-url))
(defun escape-char-for-url (char &key safe-chars)
"Escape CHAR such that it is safe to include in a URL."
(if (or (url-unreserved-character-p char) (member char safe-chars
:test #'eql))
(list char)
(coerce (format nil "~{%~X~}" (utf-8-encode-char char)) 'list)))
(declaim (ftype (function (string &key (:start integer) (:end integer)
(:safe-chars list))
string)
url-encode))
(defun url-encode (string &key (start 0) (end (length string)) safe-chars)
"URL encode (percent escape) STRING. SAFE-CHARS is a list of characters that
should not be encoded."
(coerce (loop for i upfrom start below end
for char = (aref string i)
append (escape-char-for-url char :safe-chars safe-chars))
'string))
(define-condition url-decode-error (error)
((string :accessor url-decode-error-string
:initarg :string
:type string
:documentation "The string which could not be decoded.")
(index :accessor url-decode-error-index
:initarg :index
:type integer
:documentation "The index of the problematic character."))
(:report (lambda (condition stream)
(format stream "Failed to URL decode ~S starting from index ~S."
(url-decode-error-string condition)
(url-decode-error-index condition)))))
(declaim (ftype (function (integer) integer)
count-url-characters-from-first-byte))
(defun count-url-characters-from-first-byte (first-byte)
"Return the number of characters following FIRST-BYTE in a URL encoded
string. FIRST-BYTE is not included. This does not work if FIRST-BYTE starts a
one byte escape sequence."
(case (ash first-byte -4)
(#b1111 3)
(#b1110 2)
(t 1)))
(declaim (ftype (function ((integer 0 #xFF) string integer)
(values character integer))
parse-next-n-url-characters))
(defun parse-next-n-url-characters (first-byte string start)
"Prase the next escaped sequence from STRING starting at START which began
with FIRST-BYTE. FIRST-BYTE should not be included."
(let ((count (count-url-characters-from-first-byte first-byte)))
(when (> 0 (- (length string) start (* count 3)))
(error 'url-decode-error :string string :index start))
(loop for time below count
for i = start then (+ i 3)
unless (eql #\% (aref string i))
do (error 'url-decode-error :string string :index i)
collect (handler-case
(parse-integer string :start (1+ i) :end (+ i 3)
:radix 16)
(parse-error ()
(error 'url-decode-error :string string :index (1+ i))))
into out
finally (return (values (handler-case
(utf-8-decode-char (cons first-byte out))
(type-error ()
(error 'url-decode-error :string string
:index i)))
(* (1+ count) 3))))))
(declaim (ftype (function (string &key (:start integer))
(values character integer))
parse-url-encoded-char))
(defun parse-url-encoded-char (string &key (start 0))
"Parse the next URL encoded character in STRING, starting at START, and return
it. This function assumes that the character at START is a percent sign."
(when (> 0 (- (length string) start 3))
(error 'url-decode-error :string string :index start))
(let ((first-byte (handler-case
(parse-integer string :start (1+ start) :end (+ start 3)
:radix 16)
(parse-error ()
(error 'url-decode-error :string string
:index (1+ start))))))
(if (zerop (logand first-byte #b10000000))
(values (code-char first-byte) 3)
(parse-next-n-url-characters first-byte string (+ start 3)))))
(declaim (ftype (function (string &key (:start integer) (:end integer))
string)
url-decode))
(defun url-decode (string &key (start 0) (end (length string)))
"Decode STRING, which is percent escaped."
(coerce (loop with index = start
while (< index end)
for char = (aref string index)
if (eql char #\%)
collect (multiple-value-bind (decoded skip-count)
(parse-url-encoded-char string :start index)
(incf index skip-count)
decoded )
else
collect char
and do (incf index))
'string))