;;;; This is a simple, non-validating XML parser for LAML together with XML pretty printing support (to come). ;;;; As of the current version, the parser is by no means complete. Nevertheless, it is useful tool for parsing most ;;;; everyday XML documents to a Lisp data structure. <p> Given a well-formed XML document ;;;; this parser returns a Lisp tree structure that represents the parse tree of the XML document. ;;;; This parser skips any element of the XML prolog (front matters) including the DTD (if any). ;;;; The parser also skips XML comments. ;;;; The parser handles start tags, end tags, and empty tags (in this parser called start-end tags). ;;;; Entities and their declarations are not handled at all.<p> ;;;; The top level functions are xml-parse and xml-parse-file. The xml-parser can be loaded as a library as well.<p> ;;;; There exists <a href="../doc/xml-parser.html" target="_top">elucidative documentation</a> of this parser. ;;;; This tool loads laml.scm together with the general, collect-skip and file-read libraries (load (string-append laml-dir "laml.scm")) (lib-load "general.scm") (lib-load "collect-skip.scm") (lib-load "file-read.scm") ;;; The format of the parse tree. ;;; A <em>parse tree</em> T produced by this tool is of the form ;;; <pre> (tree N ST1 ST2 ... STn) </pre> ;;; where STi, i=1..n are ST1 of the same form and N is a node (see below). tag is a symbol (for tagging a syntax tree). ;;; A leaf node N may be of the form ;;; <pre> (tree N) </pre> ;;; or just N if N is a string (corresponding to textual contents) or an empty tag (a tag without contents).<p> ;;; An <em>inner node</em> of a parse tree corresponds to a tag (an element) with contents. Such a node is represented ;;; by the following 'tag structure': ;;; <pre> (tag kind tag-name . attr-info) </pre> ;;; tag is a symbol (for tagging). kind is either start or start-end (both symbols). ;;; tag-name is a string. Attr-info is the attribute on property list format.<p> ;;; A <em>terminal node</em> may be a start-end node, or just a contents string. End tags are not represented ;;; in the parse tree.<p> ;;; Here is an example of a start-end node (empty node) with two properties: ;;; <pre> (tag start-end "title" role "xxx" size "5") </pre> ; --------------------------------------------------------------------------------------------------- ; Parse specific error and message funtions.![]()
![]()
(define (parse-error . x) (display-message (string-append "XML parse error: " (apply string-append (map as-string x)))) (parser-status) (error "STOPPING THE PARSER"))![]()
![]()
(define (parse-message . x) (if xml-parse-verbose (display-message (string-append (apply string-append (map as-string x)))))) ; --------------------------------------------------------------------------------------------------- ; Overall functions![]()
![]()
(define (skip-white-space) (skip-while is-white-space?)) ; Skip white space and XML comments![]()
![]()
(define (skip-white-space-and-comments) (skip-white-space) (ensure-look-ahead 4) (if (and (not end-of-file?) (match-look-ahead? "<!--")) (begin (parse-message "Skipping comment") (read-a-string 4) (skip-until-string "-->" #t) (skip-white-space-and-comments)))) ; -------------------------------------------------------------------------------------------------- ;;; Top level functions. ;; Top level parse function which takes a xml file name as input, and delivers on an laml file. ;; file-path is a file path (relative or absolute) with or without an extension. The defaul extension is xml. ;; The function read file-path.xml and delivers its result on file-path.laml.![]()
![]()
(define (parse-xml-file file-path) (let ((init-path (file-name-initial-path file-path)) (file-name-prop (file-name-proper file-path)) (ext (file-name-extension file-path)) ) (reset-xml-parser) (let* ((input-port (open-input-file (string-append init-path file-name-prop "." (if (empty-string? ext) "xml" ext))))) (set! ip input-port) (let ((parse-tree (parse-xml-ip)) (target-file-name (string-append init-path file-name-prop "." "lsp"))) (set! resulting-parse-tree parse-tree) (if (file-exists? target-file-name) (delete-file target-file-name)) (let ((op (open-output-file target-file-name))) (write parse-tree op) (close-output-port op))) (display-message (string-append "DONE. The result is in the file " file-name-prop "." "lsp.")) (display-message "The result is also in the variable resulting-parse-tree for interactive use.") (close-input-port ip)))) ;; This function parses a file and return the parse tree. ;; file-path is a file path (relative or absolute) without any extension.![]()
![]()
(define (parse-xml file-path) (let ((init-path (file-name-initial-path file-path)) (file-name-prop (file-name-proper file-path)) (ext (file-name-extension file-path)) ) (reset-xml-parser) (let* ((input-port (open-input-file (string-append init-path file-name-prop "." (if (empty-string? ext) "xml" ext))))) (set! ip input-port) (let ((parse-tree (parse-xml-ip))) (close-input-port ip) parse-tree))))![]()
![]()
(define (reset-xml-parser) (reset-look-ahead-buffer) (set! parse-stack '())) ; --------------------------------------------------------------------------------------------------- ;::parse-stack:: ; Parse state: the parse stack![]()
![]()
(define xml-parse-verbose #f)![]()
![]()
(define parse-stack '())![]()
![]()
(define (parse-stack-push x) (set! parse-stack (cons x parse-stack)))![]()
![]()
(define (parse-stack-pop) (if (not (parse-stack-empty?)) (set! parse-stack (cdr parse-stack)) (parse-error (string-append "Trying to pop an empty parse stack"))))![]()
![]()
(define (parse-stack-top) (if (not (parse-stack-empty?)) (car parse-stack) (parse-error (string-append "Trying to access the top of an empty parse stack"))))![]()
![]()
(define (parse-stack-empty?) (null? parse-stack))![]()
![]()
(define (parse-stack-but-top) (if (not (parse-stack-empty?)) (cdr parse-stack) (parse-error (string-append "Trying to access the top of an empty parse stack"))))![]()
![]()
(define (parse-stack-top-and-pop) (if (not (parse-stack-empty?)) (let ((res (car parse-stack))) (set! parse-stack (cdr parse-stack)) res ) (parse-error (string-append "Trying to access the top of an empty parse stack")))) ; --------------------------------------------------------------------------------------------------- ; Tag structure functions![]()
![]()
(define (make-tag-structure kind tag-name attribute-property-list) (cons 'tag (cons kind (cons tag-name attribute-property-list))))![]()
![]()
(define (kind-of-tag-structure tag-structure) (as-symbol (cadr tag-structure)))![]()
![]()
(define (tag-of-tag-structure tag-structure) (as-string (caddr tag-structure)))![]()
![]()
(define (attributes-of-tag-structure tag-structure) (cdddr tag-structure)) ; Return the attribute value of attribute-key (a symbol) in attribute-list. ; If attribute does not exist, return #f.![]()
![]()
(define (attribute-value attribute-key attribute-list) (let ((a-list (propertylist-to-alist attribute-list))) (defaulted-get attribute-key a-list #f))) ; ------------------------------------------------------- ; Tag structure predicates![]()
![]()
(define (start-node? x) (and (list x) (>= (length x) 2) (eq? (car x) 'tag) (eq? (cadr x) 'start)))![]()
![]()
(define (start-end-node? x) (and (list x) (>= (length x) 2) (eq? (car x) 'tag) (eq? (cadr x) 'start-end))) ; --------------------------------------------------------------------------------------------------- ; Parse tree functions ;; A global varible holding the latest produced parse tree![]()
![]()
(define resulting-parse-tree #f)![]()
![]()
(define (make-parse-tree node subtree-list) (cons 'tree (cons node subtree-list)))![]()
![]()
(define (root-of-parse-tree tree) (cadr tree))![]()
![]()
(define (subtrees-of-parse-tree tree) (cddr tree))![]()
![]()
(define (terminal-node? tree) (or (string? tree) (and (list? tree) (= 2 (length tree)) (string? (cadr tree))) (and (list? tree) (= 2 (length tree)) (start-end-node? (cadr tree))) (start-end-node? tree)))![]()
![]()
(define (inner-node? tree) (not (terminal-node? tree))) ; Return the node of a tree, which may be a contents string or a tag structure.![]()
![]()
(define (node-of-tree tree) (cond ((terminal-node? tree) (cond ((string? tree) tree) ((start-end-node? tree) tree) (else (root-of-parse-tree tree)))) ((inner-node? tree) (root-of-parse-tree tree)))) ; Return the node contents or tag name of the node (a symbol). An ad hoc function. ; The node may have been extracted ny node-of-tree.![]()
![]()
(define (node-info node) (cond ((text-contents-entry? node) (if (string? node) node (car node))) ((tag-entry? node) (as-symbol (tag-of-tag-structure node))) (else (error "node-info: Should not happen")))) ; Return the attributes of a node. If there is no attributes of the node, return the empty list.![]()
![]()
(define (node-attribute-info node) (cond ((text-contents-entry? node) '()) ((tag-entry? node) (attributes-of-tag-structure node)) (else (error "node-attribute-inf: Should not happen")))) ;; Return a list of result-transformed nodes that satisfy the node-interesting? predicate.![]()
![]()
(define (traverse-and-collect-from-parse-tree tree node-interesting? result-transformer) (cond ((and (terminal-node? tree) (node-interesting? tree)) (list (result-transformer tree))) ((and (terminal-node? tree) (not (node-interesting? tree))) '()) ((inner-node? tree) (let ((subtree-results (map (lambda (subtr) (traverse-and-collect-from-parse-tree subtr node-interesting? result-transformer)) (subtrees-of-parse-tree tree)))) (if (node-interesting? tree) (cons (result-transformer tree) (flatten subtree-results)) (flatten subtree-results)))))) ; (define (parse-tree-2-laml tree) ; (cond ((inner-node? tree) ; (let ((root (root-of-parse-tree tree)) ; (subtrees (subtrees-of-parse-tree tree))) ; (append ; (list (as-symbol (downcase-string (tag-of-tag-structure root)))) ; (lamlify-attributes (attributes-of-tag-structure root)) ; (map parse-tree-2-laml subtrees)))) ; ((terminal-node? tree) ; (cond ((text-contents-entry? tree) (node-info tree)) ; ((start-end-node? tree) ; (append ; (list (as-symbol (downcase-string (tag-of-tag-structure tree)))) ; (lamlify-attributes (attributes-of-tag-structure tree)))) ; (else (error "parse-tree-2-laml: Should not happen 1")))) ; (error "parse-tree-2-laml: Should not happen 2"))) ; ; (define (lamlify-attributes attr-list) ; (cond ((null? attr-list) '()) ; (else (let ((key (as-symbol (downcase-string (as-string (car attr-list))))) ; (val (cadr attr-list))) ; (cons (list 'quote key) ; (cons val (lamlify-attributes (cddr attr-list))))))))![]()
![]()
(define (parse-tree-2-laml tree output-file) (let ((res (parse-tree-2-laml-string tree)) (prefix "(load (string-append laml-dir \"laml.scm\")) (style \"simple-html4.0-loose\") (generic-page \"???\" ") (suffix ")")) (write-text-file (string-append prefix res suffix) output-file)))![]()
![]()
(define (parse-tree-2-laml-string tree) (cond ((inner-node? tree) (let ((root (root-of-parse-tree tree)) (subtrees (subtrees-of-parse-tree tree))) (string-append "(" (downcase-string (tag-of-tag-structure root)) " " (lamlify-attributes-string (attributes-of-tag-structure root)) (list-to-string (map parse-tree-2-laml-string subtrees) " ") ")"))) ((terminal-node? tree) (cond ((text-contents-entry? tree) (string-it (node-info tree))) ((start-end-node? tree) (string-append "(" (downcase-string (tag-of-tag-structure tree)) " " (lamlify-attributes-string (attributes-of-tag-structure tree)) ")")) (else (error "parse-tree-2-laml: Should not happen 1")))) (error "parse-tree-2-laml: Should not happen 2")))![]()
![]()
(define (lamlify-attributes-string attr-list) (cond ((null? attr-list) "") (else (let ((key (downcase-string (as-string (car attr-list)))) (val (cadr attr-list))) (string-append "'" key " " (string-it val) " " (lamlify-attributes-string (cddr attr-list))))))) ; --------------------------------------------------------------------------------------------------- ; Predicates on trees and tag structures: ; Most useful to make sense of stack entries. Can also be used to distinguish various kinds of subtrees ; of a tree from each other.![]()
![]()
(define (tag-entry? x) (and (list? x) (>= (length x) 2) (eq? (car x) 'tag)))![]()
![]()
(define (start-tag-entry? x) (and (tag-entry? x) (eq? (cadr x) 'start)))![]()
![]()
(define (start-end-tag-entry? x) (and (tag-entry? x) (eq? (cadr x) 'start-end)))![]()
![]()
(define (tree-entry? x) (and (list? x) (>= (length x) 2) (eq? (car x) 'tree)))![]()
![]()
(define (text-contents-entry? x) (or (string? x) (and (list? x) (= 1 (length x)) (string? (car x))))) ; --------------------------------------------------------------------------------------------------- ; Substantial parse functions.![]()
![]()
(define (parse-xml-ip) (if (not end-of-file?) (skip-white-space)) (skip-front-matters) (if (not end-of-file?) (parse-xml-balanced-expression) '() ; the empty tree ) ) ; Skip all XML document prefix stuff, including comments. ; As of this version, it cannot skip an inline DTD (causes parse error).![]()
![]()
(define (skip-front-matters) (parse-message "Skipping front matter") (skip-white-space) (ensure-look-ahead 2) (cond ((match-look-ahead? "<?") (read-a-string 2) (skip-until-string "?>" #t) (skip-front-matters)) ((match-look-ahead? "<!") ;(let ((comment (collect-balanced-until (lambda (ch) (eq? ch #\<)) (lambda (ch) (eq? ch #\>))))) 'do-nothing) (skip-front-matters))))
![]()
![]()
![]()
![]()
![]()
![]()
![]()
(define (parse-xml-balanced-expression) (parse-message "Parsing balanced expression.") (if (not end-of-file?) (skip-white-space-and-comments)) (let ((what (what-is-ahead))) ;- looks ahead - does not read (cond ((eq? what 'tag) (let* ((tag (read-tag)) ;
(kind (kind-of-tag-structure tag))) (cond ((eq? kind 'start) ;
(parse-stack-push tag) (read-and-push-subtrees-until-end-tag (tag-of-tag-structure tag)) ; consumes the end tag too (build-tree-from-stack (tag-of-tag-structure tag)) ; return inner node ) ((eq? kind 'start-end) tag ; return terminal node ) ((eq? kind 'end) (parse-error "end tag encountered without matching start tag: " (as-string tag))) (else (parse-error "parse-xml-balanced-expression: unknown kind of tag")) ) ) ) ((eq? what 'contents-string) (let ((contents-string (read-contents-string))) contents-string)) (else (parse-error "parse-xml-balanced-expression: Parse problem")))))
![]()
![]()
(define (what-is-ahead) (ensure-look-ahead 2) (let ((res (match-look-ahead? "<"))) (if res 'tag 'contents-string)))![]()
![]()
(define (read-contents-string) (remove-redundant-white-space ; may be wrong in pre-like contexts (collect-until (lambda (ch) (eq? ch #\<))))) ; remove extra white space from str by returning a truncated string. Turn extra white space in spaces. A pure function.![]()
![]()
(define (remove-redundant-white-space str) (remove-redundant-white-space-1 str "" 0 (string-length str) #f))![]()
![]()
(define (remove-redundant-white-space-1 str res i lgt removing) (cond ((= i lgt) res) ((and removing (is-white-space? (string-ref str i))) (remove-redundant-white-space-1 str res (+ i 1) lgt #t)) ((and removing (not (is-white-space? (string-ref str i)))) (remove-redundant-white-space-1 str (string-append res (as-string (string-ref str i))) (+ i 1) lgt #f)) ((and (not removing) (is-white-space? (string-ref str i))) (remove-redundant-white-space-1 str (string-append res (as-string #\space)) (+ i 1) lgt #t)) ((and (not removing) (not (is-white-space? (string-ref str i)))) (remove-redundant-white-space-1 str (string-append res (as-string (string-ref str i))) (+ i 1) lgt #f)) (else (error "remove-redundant-white-space-1: should not happen"))))![]()
![]()
![]()
![]()
(define (read-and-push-subtrees-until-end-tag end-tag-name) (skip-white-space-and-comments) (let* ((n (+ (string-length end-tag-name) 3)) ;(end-tag-string (string-append (as-string #\<) (as-string #\/) end-tag-name (as-string #\>))) ) (ensure-look-ahead n) (if (match-look-ahead? end-tag-string) ;
(begin (read-a-string n) ; finish (skip-white-space-and-comments)) (let ((subtree (parse-xml-balanced-expression))) ;
(parse-stack-push subtree) (skip-white-space-and-comments) (read-and-push-subtrees-until-end-tag end-tag-name) ; tail recursive parsing of contents ))))
![]()
![]()
![]()
(define (build-tree-from-stack end-tag-name) (build-tree-from-stack-1 end-tag-name '()))![]()
![]()
(define (build-tree-from-stack-1 tag-name tree-list) (let ((top (parse-stack-top-and-pop))) (if (and (start-tag-entry? top) (matches-stack-entry top tag-name)) (make-parse-tree top tree-list) (build-tree-from-stack-1 tag-name (cons top tree-list)))))![]()
![]()
(define (matches-stack-entry top-tag-structure tag-name) (equal? (downcase-string (tag-of-tag-structure top-tag-structure)) (downcase-string tag-name))) ; We know that we are just in front of a tag. Return a tag structure.![]()
![]()
![]()
(define (read-tag) (parse-message "Reading tag") (skip-white-space) (ensure-look-ahead 1) (if (match-look-ahead? "<") (read-a-char) (parse-error "'<' expected")) (ensure-look-ahead 1) (if (match-look-ahead? "/") ; end tag (begin ;(read-a-char) (let ((tag-name (collect-until (lambda (ch) (or (eq? ch #\>) (is-white-space? ch)))))) (ensure-look-ahead 1) (if (match-look-ahead? ">") (read-a-char) ; > (parse-error "An end tag must not contain anything after the tag name")) (parse-message " " "End: " tag-name) (make-tag-structure 'end tag-name '()))) (let ((tag-name (collect-until (lambda (ch) (or (eq? ch #\>) (eq? ch #\/) (is-white-space? ch)))))) (skip-white-space) (ensure-look-ahead 2) ;
(cond ((and (>= (string-length (max-look-ahead-prefix)) 2) (match-look-ahead? "/>")) ;
- weird (read-a-string 2) (make-tag-structure 'start-end tag-name '())) ((and (>= (string-length (max-look-ahead-prefix)) 1) (match-look-ahead? ">")) ;
(read-a-char) (make-tag-structure 'start tag-name '())) (else (let ((attribute-list (read-tag-attributes))) ;
- white space processes at this point (ensure-look-ahead 2) (cond ((and (>= (string-length (max-look-ahead-prefix)) 2) (match-look-ahead? "/>")) (read-a-string 2) (parse-message " " "Start end: " tag-name ". Attributes: " (as-string attribute-list)) (make-tag-structure 'start-end tag-name attribute-list)) ((and (>= (string-length (max-look-ahead-prefix)) 1) (match-look-ahead? ">")) (read-a-char) (parse-message " " "Start: " tag-name ". Attributes: " (as-string attribute-list)) (make-tag-structure 'start tag-name attribute-list)) (else (parse-error "read-tag: end of tag expected")))) ))))) ; Return attribute value pairs as a property list ; Expect that we are at the first character the first attribute name
![]()
![]()
(define (read-tag-attributes) (skip-white-space) (let ((attr-val (read-attribute-value-pair))) ; cons-pair or #f. (skip-white-space) (if attr-val (cons (car attr-val) (cons (cdr attr-val) (read-tag-attributes) ; recursive reading ) ) '()))) ; read a single attribute value pair from the input port and return it is a cons pair. ; If no more attributes exists (we see end of tag characters) return #f.![]()
![]()
(define (read-attribute-value-pair) (ensure-look-ahead 2) (if (or (match-look-ahead? "/>") (match-look-ahead? ">")) #f (let* ((attr-name (collect-until (lambda (ch) (or (eq? ch #\=) (is-white-space? ch)))))) (skip-white-space) (ensure-look-ahead 1) (if (match-look-ahead? "=") (begin (read-a-char) (skip-white-space)) (parse-error "= expected after attribute value")) (ensure-look-ahead 1) (if (match-look-ahead? "\"") (begin (read-a-char) ; reading first " (let ((value (collect-until (char-predicate #\")))) (read-a-char) ; reading second " (skip-white-space) (cons (as-symbol attr-name) value))) (parse-error "Attribute value in string quotes expected"))))) ; --------------------------------------------------------------------------------------------------- ; Utility functions ;; Return a list of of attribute values of attr-key in the parse tree.![]()
![]()
(define (collect-attributes-in-tree tree attr-key) (filter (lambda (x) (if x #t #f)) (traverse-and-collect-from-parse-tree tree (lambda (tr) (tag-entry? (node-of-tree tr))) (lambda (tr) (attribute-value attr-key (attributes-of-tag-structure (node-of-tree tr))))))) ;; Display parser status in case of error in the parse process.![]()
![]()
(define (parser-status) (display (stack-status)) (newline) (display (input-status)) )![]()
![]()
(define (stack-status) (string-append "THE PARSE STACK: " (as-string #\newline) (stack-status-1 parse-stack)))![]()
![]()
(define (stack-status-1 lst) (if (null? lst) "" (let ((top (car lst))) (string-append (cond ((text-contents-entry? top) (as-string (node-info top))) ((tag-entry? top) (as-string (node-info top))) ((tree-entry? top) (as-string (node-info (node-of-tree top)))) (else "???")) (as-string #\newline) (stack-status-1 (cdr lst))))))![]()
![]()
(define (input-status) (ensure-look-ahead 100) (string-append "PREFIX OF REMAINING INPUT: " (as-string #\newline) (as-string #\") (max-look-ahead-prefix) (as-string #\") (as-string #\newline)))