185 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Common Lisp
		
	
	
	
	
	
			
		
		
	
	
			185 lines
		
	
	
		
			7.8 KiB
		
	
	
	
		
			Common Lisp
		
	
	
	
	
	
| (in-package :cl-xdg-trash/url-encode)
 | |
| ;; Some useful links
 | |
| ;; https://en.wikipedia.org/wiki/UTF-8
 | |
| ;; https://en.wikipedia.org/wiki/Percent-encoding
 | |
| 
 | |
| (declaim (ftype (function (character) t) url-unreserved-character-p))
 | |
| (defun url-unreserved-character-p (char)
 | |
|   "Return non-nil if CHAR is an unreserved character in a URL."
 | |
|   (or (member char '(#\- #\_ #\~ #\.) :test #'eql)
 | |
|       (let ((code (char-code char)))
 | |
|         (or (and (>= code (char-code #\a))
 | |
|                  (<= code (char-code #\z)))
 | |
|             (and (>= code (char-code #\A))
 | |
|                  (<= code (char-code #\Z)))
 | |
|             (and (>= code (char-code #\0))
 | |
|                  (<= code (char-code #\9)))))))
 | |
| 
 | |
| (deftype utf-8-code-point ()
 | |
|   '(integer 0 #x10FFFF))
 | |
| 
 | |
| (deftype utf-8-middle-byte ()
 | |
|   '(integer #b10000000 #b10111111))
 | |
| 
 | |
| (deftype utf-8-char ()
 | |
|   '(or
 | |
|     (cons (integer 0 #b1111111) null)
 | |
| 
 | |
|     (cons (integer #b11000000 #b11011111) (cons utf-8-middle-byte null))
 | |
| 
 | |
|     (cons (integer #b11100000 #b11101111)
 | |
|      (cons utf-8-middle-byte (cons utf-8-middle-byte null)))
 | |
| 
 | |
|     (cons (integer #b11110000 #b11110111)
 | |
|      (cons utf-8-middle-byte (cons utf-8-middle-byte (cons utf-8-middle-byte null))))))
 | |
| 
 | |
| (declaim (ftype (function (character) list-of-code-points) utf-8-char))
 | |
| (defun utf-8-encode-char (char)
 | |
|   "Encode CHAR, a character, to a list of bytes that make up its UTF-8
 | |
| representation."
 | |
|   (let* ((code (char-code char))
 | |
|          (length (integer-length code)))
 | |
|     (cond
 | |
|       ((<= length 7) (list code))
 | |
|       ((<= length 11) (list (+ (ash code -6) #b11000000)
 | |
|                             (+ (logand code #b111111) #b10000000)))
 | |
|       ((<= length 16) (list (+ (ash code -12) #b11100000)
 | |
|                             (+ (logand (ash code -6) #b111111) #b10000000)
 | |
|                             (+ (logand code #b111111) #b10000000)))
 | |
|       (t (list (+ (ash code -24) #b11110000)
 | |
|                (+ (logand (ash code -12) #b111111) #b10000000)
 | |
|                (+ (logand (ash code -6) #b111111) #b10000000)
 | |
|                (+ (logand code #b111111) #b10000000))))))
 | |
| 
 | |
| (declaim (ftype (function (utf-8-char) character) utf-8-decode-char))
 | |
| (defun utf-8-decode-char (char)
 | |
|   "Decode a list of up to 4 UTF-8 bytes to and return the character the
 | |
| represent."
 | |
|   (check-type char utf-8-char)
 | |
|   (destructuring-bind (&optional part1 part2 part3 part4) char
 | |
|     (declare (type (or null (integer 0 #xFF)) part1 part2 part3 part4))
 | |
|     (code-char
 | |
|      (cond
 | |
|        ;; 4 byte
 | |
|        (part4 (+ (ash (logand part1 #b111) 18)
 | |
|                  (ash (logand part2 #b111111) 12)
 | |
|                  (ash (logand part3 #b111111) 6)
 | |
|                  (logand part4 #b111111)))
 | |
|        ;; 3 bytes
 | |
|        (part3 (+ (ash (logand part1 #b1111) 12)
 | |
|                  (ash (logand part2 #b111111) 6)
 | |
|                  (logand part3 #b111111)))
 | |
|        ;; 2 bytes
 | |
|        (part2 (+ (ash (logand part1 #b11111) 6)
 | |
|                  (logand part2 #b111111)))
 | |
|        ;; 1 byte
 | |
|        (t part1)))))
 | |
| 
 | |
| (declaim (ftype (function (character &key (:safe-chars list)) list)
 | |
|                 escape-char-for-url))
 | |
| (defun escape-char-for-url (char &key safe-chars)
 | |
|   "Escape CHAR such that it is safe to include in a URL."
 | |
|   (if (or (url-unreserved-character-p char) (member char safe-chars
 | |
|                                                     :test #'eql))
 | |
|       (list char)
 | |
|       (coerce (format nil "~{%~X~}" (utf-8-encode-char char)) 'list)))
 | |
| 
 | |
| (declaim (ftype (function (string &key (:start integer) (:end integer)
 | |
|                                   (:safe-chars list))
 | |
|                           string)
 | |
|                 url-encode))
 | |
| (defun url-encode (string &key (start 0) (end (length string)) safe-chars)
 | |
|   "URL encode (percent escape) STRING. SAFE-CHARS is a list of characters that
 | |
| should not be encoded."
 | |
|   (coerce (loop for i upfrom start below end
 | |
|                 for char = (aref string i)
 | |
|                 append (escape-char-for-url char :safe-chars safe-chars))
 | |
|           'string))
 | |
| 
 | |
| (define-condition url-decode-error (error)
 | |
|   ((string :accessor url-decode-error-string
 | |
|            :initarg :string
 | |
|            :type string
 | |
|            :documentation "The string which could not be decoded.")
 | |
|    (index :accessor url-decode-error-index
 | |
|           :initarg :index
 | |
|           :type integer
 | |
|           :documentation "The index of the problematic character."))
 | |
|   (:report (lambda (condition stream)
 | |
|              (format stream "Failed to URL decode ~S starting from index ~S."
 | |
|                      (url-decode-error-string condition)
 | |
|                      (url-decode-error-index condition)))))
 | |
| 
 | |
| (declaim (ftype (function (integer) integer)
 | |
|                 count-url-characters-from-first-byte))
 | |
| (defun count-url-characters-from-first-byte (first-byte)
 | |
|   "Return the number of characters following FIRST-BYTE in a URL encoded
 | |
| string. FIRST-BYTE is not included. This does not work if FIRST-BYTE starts a
 | |
| one byte escape sequence."
 | |
|   (case (ash first-byte -4)
 | |
|     (#b1111 3)
 | |
|     (#b1110 2)
 | |
|     (t 1)))
 | |
| 
 | |
| (declaim (ftype (function ((integer 0 #xFF) string integer)
 | |
|                           (values character integer))
 | |
|                 parse-next-n-url-characters))
 | |
| (defun parse-next-n-url-characters (first-byte string start)
 | |
|   "Prase the next escaped sequence from STRING starting at START which began
 | |
| with FIRST-BYTE. FIRST-BYTE should not be included."
 | |
|   (let ((count (count-url-characters-from-first-byte first-byte)))
 | |
|     (when (> 0 (- (length string) start (* count 3)))
 | |
|       (error 'url-decode-error :string string :index start))
 | |
|     (loop for time below count
 | |
|           for i = start then (+ i 3)
 | |
|           unless (eql #\% (aref string i))
 | |
|             do (error 'url-decode-error :string string :index i)
 | |
|           collect (handler-case
 | |
|                       (parse-integer string :start (1+ i) :end (+ i 3)
 | |
|                                             :radix 16)
 | |
|                     (parse-error ()
 | |
|                       (error 'url-decode-error :string string :index (1+ i))))
 | |
|             into out
 | |
|           finally (return (values (handler-case
 | |
|                                       (utf-8-decode-char (cons first-byte out))
 | |
|                                     (type-error ()
 | |
|                                       (error 'url-decode-error :string string
 | |
|                                                                :index i)))
 | |
|                                   (* (1+ count) 3))))))
 | |
| 
 | |
| (declaim (ftype (function (string &key (:start integer))
 | |
|                           (values character integer))
 | |
|                 parse-url-encoded-char))
 | |
| (defun parse-url-encoded-char (string &key (start 0))
 | |
|   "Parse the next URL encoded character in STRING, starting at START, and return
 | |
| it. This function assumes that the character at START is a percent sign."
 | |
|   (when (> 0 (- (length string) start 3))
 | |
|     (error 'url-decode-error :string string :index start))
 | |
|   (let ((first-byte (handler-case
 | |
|                         (parse-integer string :start (1+ start) :end (+ start 3)
 | |
|                                               :radix 16)
 | |
|                       (parse-error ()
 | |
|                         (error 'url-decode-error :string string
 | |
|                                                  :index (1+ start))))))
 | |
|     (if (zerop (logand first-byte #b10000000))
 | |
|         (values (code-char first-byte) 3)
 | |
|         (parse-next-n-url-characters first-byte string (+ start 3)))))
 | |
| 
 | |
| (declaim (ftype (function (string &key (:start integer) (:end integer))
 | |
|                           string)
 | |
|                 url-decode))
 | |
| (defun url-decode (string &key (start 0) (end (length string)))
 | |
|   "Decode STRING, which is percent escaped."
 | |
|   (coerce (loop with index = start
 | |
|                 while (< index end)
 | |
|                 for char = (aref string index)
 | |
|                 if (eql char #\%)
 | |
|                   collect (multiple-value-bind (decoded skip-count)
 | |
|                               (parse-url-encoded-char string :start index)
 | |
|                             (incf index skip-count)
 | |
|                             decoded )
 | |
|                 else
 | |
|                   collect char
 | |
|                   and do (incf index))
 | |
|           'string))
 |