171 lines
7.1 KiB
Common Lisp
171 lines
7.1 KiB
Common Lisp
(in-package :cl-xdg-trash/url-encode)
|
|
|
|
(declaim (ftype (function (character) t) url-unreserved-character-p))
|
|
(defun url-unreserved-character-p (char)
|
|
"Return non-nil if CHAR is an unreserved character in a URL."
|
|
(or (member char '(#\- #\_ #\~ #\.) :test #'eql)
|
|
(let ((code (char-code char)))
|
|
(or (and (>= code (char-code #\a))
|
|
(<= code (char-code #\z)))
|
|
(and (>= code (char-code #\A))
|
|
(<= code (char-code #\Z)))))))
|
|
|
|
(deftype utf-8-code-point ()
|
|
'(integer 0 #x10FFFF))
|
|
|
|
(deftype utf-8-middle-byte ()
|
|
'(integer #b10000000 #b10111111))
|
|
|
|
(deftype utf-8-char ()
|
|
'(or
|
|
(cons (integer 0 #b1010111) null)
|
|
|
|
(cons (integer #b11000000 #b11011111) (cons utf-8-middle-byte null))
|
|
|
|
(cons (integer #b11100000 #b11101111)
|
|
(cons utf-8-middle-byte (cons utf-8-middle-byte null)))
|
|
|
|
(cons (integer #b11110000 #b11110111)
|
|
(cons utf-8-middle-byte (cons utf-8-middle-byte (cons utf-8-middle-byte null))))))
|
|
|
|
(declaim (ftype (function (character) list-of-code-points) utf-8-char))
|
|
(defun utf-8-encode-char (char)
|
|
"Encode CHAR, a character, to a list of bytes that make up its UTF-8
|
|
representation."
|
|
(let* ((code (char-code char))
|
|
(length (integer-length code)))
|
|
(cond
|
|
((<= length 7) (list code))
|
|
((<= length 11) (list (+ (ash code -6) #b11000000)
|
|
(+ (logand code #b111111) #b10000000)))
|
|
((<= length 16) (list (+ (ash code -12) #b11100000)
|
|
(+ (logand (ash code -6) #b111111) #b10000000)
|
|
(+ (logand code #b111111) #b10000000)))
|
|
(t (list (+ (ash code -24) #b11110000)
|
|
(+ (logand (ash code -12) #b111111) #b10000000)
|
|
(+ (logand (ash code -6) #b111111) #b10000000)
|
|
(+ (logand code #b111111) #b10000000))))))
|
|
|
|
(declaim (ftype (function (utf-8-char) character) utf-8-decode-char))
|
|
(defun utf-8-decode-char (char)
|
|
"Decode a list of up to 4 UTF-8 bytes to and return the character the
|
|
represent."
|
|
(check-type char utf-8-char)
|
|
(destructuring-bind (&optional part1 part2 part3 part4) char
|
|
(declare (type (or null (integer 0 #xFF)) part1 part2 part3 part4))
|
|
(code-char
|
|
(cond
|
|
;; 4 byte
|
|
(part4 (+ (ash (logand part1 #b111) 18)
|
|
(ash (logand part2 #b111111) 12)
|
|
(ash (logand part3 #b111111) 6)
|
|
(logand part4 #b111111)))
|
|
;; 3 bytes
|
|
(part3 (+ (ash (logand part1 #b1111) 12)
|
|
(ash (logand part2 #b111111) 6)
|
|
(logand part3 #b111111)))
|
|
;; 2 bytes
|
|
(part2 (+ (ash (logand part1 #b11111) 6)
|
|
(logand part2 #b111111)))
|
|
;; 1 byte
|
|
(t part1)))))
|
|
|
|
(declaim (ftype (function (character) string) escape-char-for-url))
|
|
(defun escape-char-for-url (char)
|
|
"Escape CHAR such that it is safe to include in a URL."
|
|
(if (url-unreserved-character-p char)
|
|
(string char)
|
|
(format nil "~{%~X~}" (utf-8-encode-char char))))
|
|
|
|
(declaim (ftype (function (string) string) url-encode))
|
|
(defun url-encode (string)
|
|
"URL encode (percent escape) STRING."
|
|
(format nil "~{~A~}" (map 'list 'escape-char-for-url string)))
|
|
|
|
(define-condition url-decode-error (error)
|
|
((string :accessor url-decode-error-string
|
|
:initarg :string
|
|
:type string
|
|
:documentation "The string which could not be decoded.")
|
|
(index :accessor url-decode-error-index
|
|
:initarg :index
|
|
:type integer
|
|
:documentation "The index of the problematic character."))
|
|
(:report (lambda (condition stream)
|
|
(format stream "Failed to URL decode ~S starting from index ~S."
|
|
(url-decode-error-string condition)
|
|
(url-decode-error-index condition)))))
|
|
|
|
(declaim (ftype (function (integer) integer)
|
|
count-url-characters-from-first-byte))
|
|
(defun count-url-characters-from-first-byte (first-byte)
|
|
"Return the number of characters following FIRST-BYTE in a URL encoded
|
|
string. FIRST-BYTE is not included. This does not work if FIRST-BYTE starts a
|
|
one byte escape sequence."
|
|
(case (ash first-byte -4)
|
|
(#b1111 3)
|
|
(#b1110 2)
|
|
(t 1)))
|
|
|
|
(declaim (ftype (function ((integer 0 #xFF) string integer)
|
|
(values character integer))
|
|
parse-next-n-url-characters))
|
|
(defun parse-next-n-url-characters (first-byte string start)
|
|
"Prase the next escaped sequence from STRING starting at START which began
|
|
with FIRST-BYTE. FIRST-BYTE should not be included."
|
|
(let ((count (count-url-characters-from-first-byte first-byte)))
|
|
(when (> 0 (- (length string) start (* count 3)))
|
|
(error 'url-decode-error :string string :index start))
|
|
(loop for time below count
|
|
for i = start then (+ i 3)
|
|
unless (eql #\% (aref string i))
|
|
do (error 'url-decode-error :string string :index i)
|
|
collect (handler-case
|
|
(parse-integer string :start (1+ i) :end (+ i 3)
|
|
:radix 16)
|
|
(parse-error ()
|
|
(error 'url-decode-error :string string :index (1+ i))))
|
|
into out
|
|
finally (return (values (handler-case
|
|
(utf-8-decode-char (cons first-byte out))
|
|
(type-error ()
|
|
(error 'url-decode-error :string string
|
|
:index i)))
|
|
(* (1+ count) 3))))))
|
|
|
|
(declaim (ftype (function (string &key (:start integer))
|
|
(values character integer))
|
|
parse-url-encoded-char))
|
|
(defun parse-url-encoded-char (string &key (start 0))
|
|
"Parse the next URL encoded character in STRING, starting at START, and return
|
|
it. This function assumes that the character at START is a percent sign."
|
|
(when (> 0 (- (length string) start 3))
|
|
(error 'url-decode-error :string string :index start))
|
|
(let ((first-byte (handler-case
|
|
(parse-integer string :start (1+ start) :end (+ start 3)
|
|
:radix 16)
|
|
(parse-error ()
|
|
(error 'url-decode-error :string string
|
|
:index (1+ start))))))
|
|
(if (zerop (logand first-byte #b10000000))
|
|
(values (code-char first-byte) 3)
|
|
(parse-next-n-url-characters first-byte string (+ start 3)))))
|
|
|
|
(declaim (ftype (function (string &key (:start integer) (:end integer))
|
|
string)
|
|
url-decode))
|
|
(defun url-decode (string &key (start 0) (end (length string)))
|
|
"Decode STRING, which is percent escaped."
|
|
(coerce (loop with index = start
|
|
while (< index end)
|
|
for char = (aref string index)
|
|
if (eql char #\%)
|
|
collect (multiple-value-bind (decoded skip-count)
|
|
(parse-url-encoded-char string :start index)
|
|
(incf index skip-count)
|
|
decoded )
|
|
else
|
|
collect char
|
|
and do (incf index))
|
|
'string))
|