Initial commit

This commit is contained in:
2025-09-30 14:29:54 -07:00
commit 0419339a36
7 changed files with 983 additions and 0 deletions

170
url-encode.lisp Normal file
View File

@ -0,0 +1,170 @@
(in-package :cl-xdg-trash/url-encode)
(declaim (ftype (function (character) t) url-unreserved-character-p))
(defun url-unreserved-character-p (char)
"Return non-nil if CHAR is an unreserved character in a URL."
(or (member char '(#\- #\_ #\~ #\.) :test #'eql)
(let ((code (char-code char)))
(or (and (>= code (char-code #\a))
(<= code (char-code #\z)))
(and (>= code (char-code #\A))
(<= code (char-code #\Z)))))))
(deftype utf-8-code-point ()
'(integer 0 #x10FFFF))
(deftype utf-8-middle-byte ()
'(integer #b10000000 #b10111111))
(deftype utf-8-char ()
'(or
(cons (integer 0 #b1010111) null)
(cons (integer #b11000000 #b11011111) (cons utf-8-middle-byte null))
(cons (integer #b11100000 #b11101111)
(cons utf-8-middle-byte (cons utf-8-middle-byte null)))
(cons (integer #b11110000 #b11110111)
(cons utf-8-middle-byte (cons utf-8-middle-byte (cons utf-8-middle-byte null))))))
(declaim (ftype (function (character) list-of-code-points) utf-8-char))
(defun utf-8-encode-char (char)
"Encode CHAR, a character, to a list of bytes that make up its UTF-8
representation."
(let* ((code (char-code char))
(length (integer-length code)))
(cond
((<= length 7) (list code))
((<= length 11) (list (+ (ash code -6) #b11000000)
(+ (logand code #b111111) #b10000000)))
((<= length 16) (list (+ (ash code -12) #b11100000)
(+ (logand (ash code -6) #b111111) #b10000000)
(+ (logand code #b111111) #b10000000)))
(t (list (+ (ash code -24) #b11110000)
(+ (logand (ash code -12) #b111111) #b10000000)
(+ (logand (ash code -6) #b111111) #b10000000)
(+ (logand code #b111111) #b10000000))))))
(declaim (ftype (function (utf-8-char) character) utf-8-decode-char))
(defun utf-8-decode-char (char)
"Decode a list of up to 4 UTF-8 bytes to and return the character the
represent."
(check-type char utf-8-char)
(destructuring-bind (&optional part1 part2 part3 part4) char
(declare (type (or null (integer 0 #xFF)) part1 part2 part3 part4))
(code-char
(cond
;; 4 byte
(part4 (+ (ash (logand part1 #b111) 18)
(ash (logand part2 #b111111) 12)
(ash (logand part3 #b111111) 6)
(logand part4 #b111111)))
;; 3 bytes
(part3 (+ (ash (logand part1 #b1111) 12)
(ash (logand part2 #b111111) 6)
(logand part3 #b111111)))
;; 2 bytes
(part2 (+ (ash (logand part1 #b11111) 6)
(logand part2 #b111111)))
;; 1 byte
(t part1)))))
(declaim (ftype (function (character) string) escape-char-for-url))
(defun escape-char-for-url (char)
"Escape CHAR such that it is safe to include in a URL."
(if (url-unreserved-character-p char)
(string char)
(format nil "~{%~X~}" (utf-8-encode-char char))))
(declaim (ftype (function (string) string) url-encode))
(defun url-encode (string)
"URL encode (percent escape) STRING."
(format nil "~{~A~}" (map 'list 'escape-char-for-url string)))
(define-condition url-decode-error (error)
((string :accessor url-decode-error-string
:initarg :string
:type string
:documentation "The string which could not be decoded.")
(index :accessor url-decode-error-index
:initarg :index
:type integer
:documentation "The index of the problematic character."))
(:report (lambda (condition stream)
(format stream "Failed to URL decode ~S starting from index ~S."
(url-decode-error-string condition)
(url-decode-error-index condition)))))
(declaim (ftype (function (integer) integer)
count-url-characters-from-first-byte))
(defun count-url-characters-from-first-byte (first-byte)
"Return the number of characters following FIRST-BYTE in a URL encoded
string. FIRST-BYTE is not included. This does not work if FIRST-BYTE starts a
one byte escape sequence."
(case (ash first-byte -4)
(#b1111 3)
(#b1110 2)
(t 1)))
(declaim (ftype (function ((integer 0 #xFF) string integer)
(values character integer))
parse-next-n-url-characters))
(defun parse-next-n-url-characters (first-byte string start)
"Prase the next escaped sequence from STRING starting at START which began
with FIRST-BYTE. FIRST-BYTE should not be included."
(let ((count (count-url-characters-from-first-byte first-byte)))
(when (> 0 (- (length string) start (* count 3)))
(error 'url-decode-error :string string :index start))
(loop for time below count
for i = start then (+ i 3)
unless (eql #\% (aref string i))
do (error 'url-decode-error :string string :index i)
collect (handler-case
(parse-integer string :start (1+ i) :end (+ i 3)
:radix 16)
(parse-error ()
(error 'url-decode-error :string string :index (1+ i))))
into out
finally (return (values (handler-case
(utf-8-decode-char (cons first-byte out))
(type-error ()
(error 'url-decode-error :string string
:index i)))
(* (1+ count) 3))))))
(declaim (ftype (function (string &key (:start integer))
(values character integer))
parse-url-encoded-char))
(defun parse-url-encoded-char (string &key (start 0))
"Parse the next URL encoded character in STRING, starting at START, and return
it. This function assumes that the character at START is a percent sign."
(when (> 0 (- (length string) start 3))
(error 'url-decode-error :string string :index start))
(let ((first-byte (handler-case
(parse-integer string :start (1+ start) :end (+ start 3)
:radix 16)
(parse-error ()
(error 'url-decode-error :string string
:index (1+ start))))))
(if (zerop (logand first-byte #b10000000))
(values (code-char first-byte) 3)
(parse-next-n-url-characters first-byte string (+ start 3)))))
(declaim (ftype (function (string &key (:start integer) (:end integer))
string)
url-decode))
(defun url-decode (string &key (start 0) (end (length string)))
"Decode STRING, which is percent escaped."
(coerce (loop with index = start
while (< index end)
for char = (aref string index)
if (eql char #\%)
collect (multiple-value-bind (decoded skip-count)
(parse-url-encoded-char string :start index)
(incf index skip-count)
decoded )
else
collect char
and do (incf index))
'string))