(in-package :cl-xdg-trash/url-encode) ;; Some useful links ;; https://en.wikipedia.org/wiki/UTF-8 ;; https://en.wikipedia.org/wiki/Percent-encoding (declaim (ftype (function (character) t) url-unreserved-character-p)) (defun url-unreserved-character-p (char) "Return non-nil if CHAR is an unreserved character in a URL." (or (member char '(#\- #\_ #\~ #\.) :test #'eql) (let ((code (char-code char))) (or (and (>= code (char-code #\a)) (<= code (char-code #\z))) (and (>= code (char-code #\A)) (<= code (char-code #\Z))))))) (deftype utf-8-code-point () '(integer 0 #x10FFFF)) (deftype utf-8-middle-byte () '(integer #b10000000 #b10111111)) (deftype utf-8-char () '(or (cons (integer 0 #b1111111) null) (cons (integer #b11000000 #b11011111) (cons utf-8-middle-byte null)) (cons (integer #b11100000 #b11101111) (cons utf-8-middle-byte (cons utf-8-middle-byte null))) (cons (integer #b11110000 #b11110111) (cons utf-8-middle-byte (cons utf-8-middle-byte (cons utf-8-middle-byte null)))))) (declaim (ftype (function (character) list-of-code-points) utf-8-char)) (defun utf-8-encode-char (char) "Encode CHAR, a character, to a list of bytes that make up its UTF-8 representation." (let* ((code (char-code char)) (length (integer-length code))) (cond ((<= length 7) (list code)) ((<= length 11) (list (+ (ash code -6) #b11000000) (+ (logand code #b111111) #b10000000))) ((<= length 16) (list (+ (ash code -12) #b11100000) (+ (logand (ash code -6) #b111111) #b10000000) (+ (logand code #b111111) #b10000000))) (t (list (+ (ash code -24) #b11110000) (+ (logand (ash code -12) #b111111) #b10000000) (+ (logand (ash code -6) #b111111) #b10000000) (+ (logand code #b111111) #b10000000)))))) (declaim (ftype (function (utf-8-char) character) utf-8-decode-char)) (defun utf-8-decode-char (char) "Decode a list of up to 4 UTF-8 bytes to and return the character the represent." (check-type char utf-8-char) (destructuring-bind (&optional part1 part2 part3 part4) char (declare (type (or null (integer 0 #xFF)) part1 part2 part3 part4)) (code-char (cond ;; 4 byte (part4 (+ (ash (logand part1 #b111) 18) (ash (logand part2 #b111111) 12) (ash (logand part3 #b111111) 6) (logand part4 #b111111))) ;; 3 bytes (part3 (+ (ash (logand part1 #b1111) 12) (ash (logand part2 #b111111) 6) (logand part3 #b111111))) ;; 2 bytes (part2 (+ (ash (logand part1 #b11111) 6) (logand part2 #b111111))) ;; 1 byte (t part1))))) (declaim (ftype (function (character) string) escape-char-for-url)) (defun escape-char-for-url (char) "Escape CHAR such that it is safe to include in a URL." (if (url-unreserved-character-p char) (string char) (format nil "~{%~X~}" (utf-8-encode-char char)))) (declaim (ftype (function (string) string) url-encode)) (defun url-encode (string) "URL encode (percent escape) STRING." (format nil "~{~A~}" (map 'list 'escape-char-for-url string))) (define-condition url-decode-error (error) ((string :accessor url-decode-error-string :initarg :string :type string :documentation "The string which could not be decoded.") (index :accessor url-decode-error-index :initarg :index :type integer :documentation "The index of the problematic character.")) (:report (lambda (condition stream) (format stream "Failed to URL decode ~S starting from index ~S." (url-decode-error-string condition) (url-decode-error-index condition))))) (declaim (ftype (function (integer) integer) count-url-characters-from-first-byte)) (defun count-url-characters-from-first-byte (first-byte) "Return the number of characters following FIRST-BYTE in a URL encoded string. FIRST-BYTE is not included. This does not work if FIRST-BYTE starts a one byte escape sequence." (case (ash first-byte -4) (#b1111 3) (#b1110 2) (t 1))) (declaim (ftype (function ((integer 0 #xFF) string integer) (values character integer)) parse-next-n-url-characters)) (defun parse-next-n-url-characters (first-byte string start) "Prase the next escaped sequence from STRING starting at START which began with FIRST-BYTE. FIRST-BYTE should not be included." (let ((count (count-url-characters-from-first-byte first-byte))) (when (> 0 (- (length string) start (* count 3))) (error 'url-decode-error :string string :index start)) (loop for time below count for i = start then (+ i 3) unless (eql #\% (aref string i)) do (error 'url-decode-error :string string :index i) collect (handler-case (parse-integer string :start (1+ i) :end (+ i 3) :radix 16) (parse-error () (error 'url-decode-error :string string :index (1+ i)))) into out finally (return (values (handler-case (utf-8-decode-char (cons first-byte out)) (type-error () (error 'url-decode-error :string string :index i))) (* (1+ count) 3)))))) (declaim (ftype (function (string &key (:start integer)) (values character integer)) parse-url-encoded-char)) (defun parse-url-encoded-char (string &key (start 0)) "Parse the next URL encoded character in STRING, starting at START, and return it. This function assumes that the character at START is a percent sign." (when (> 0 (- (length string) start 3)) (error 'url-decode-error :string string :index start)) (let ((first-byte (handler-case (parse-integer string :start (1+ start) :end (+ start 3) :radix 16) (parse-error () (error 'url-decode-error :string string :index (1+ start)))))) (if (zerop (logand first-byte #b10000000)) (values (code-char first-byte) 3) (parse-next-n-url-characters first-byte string (+ start 3))))) (declaim (ftype (function (string &key (:start integer) (:end integer)) string) url-decode)) (defun url-decode (string &key (start 0) (end (length string))) "Decode STRING, which is percent escaped." (coerce (loop with index = start while (< index end) for char = (aref string index) if (eql char #\%) collect (multiple-value-bind (decoded skip-count) (parse-url-encoded-char string :start index) (incf index skip-count) decoded ) else collect char and do (incf index)) 'string))