001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     * 
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     * 
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */ 
017    package org.apache.commons.betwixt;
018     /**
019      * <p><code>XMLUtils</code> contains basic utility methods for XML.</p>
020      * 
021      * <p>The code for {@link #isWellFormedXMLName} is based on code in 
022      * <code>org.apache.xerces.util.XMLChar</code> 
023      * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
024      * The authors of this class are credited below.</p>
025      *
026      * @author Glenn Marcy, IBM
027      * @author Andy Clark, IBM
028      * @author Eric Ye, IBM
029      * @author Arnaud  Le Hors, IBM
030      * @author Rahul Srivastava, Sun Microsystems Inc.  
031      *
032      * @author Robert Burrell Donkin
033      * @since 0.5
034      */
public class XMLUtils {

    // Constants
    //-------------------------------------------------------------------------

    /** Escaped <code>&lt;</code> entity */
    public static final String LESS_THAN_ENTITY = "&lt;";
    /** Escaped <code>&gt;</code> entity */
    public static final String GREATER_THAN_ENTITY = "&gt;";
    /** Escaped <code>&amp;</code> entity */
    public static final String AMPERSAND_ENTITY = "&amp;";
    /** Escaped <code>'</code> entity */
    public static final String APOSTROPHE_ENTITY = "&apos;";
    /** Escaped <code>"</code> entity */
    public static final String QUOTE_ENTITY = "&quot;";

    // Used by isWellFormedXMLName
    /** Name start character mask. */
    private static final int MASK_NAME_START = 0x01;
    /** Name character mask. */
    private static final int MASK_NAME = 0x02;

    // Class attributes
    //-------------------------------------------------------------------------

    /**
     * Character flags, indexed by UTF-16 code unit (<code>0x0000</code> to
     * <code>0xFFFF</code>). Each entry is a bitwise OR of
     * {@link #MASK_NAME_START} and/or {@link #MASK_NAME}; zero means the
     * character may not appear in a name at all.
     */
    private static final byte[] CHARS = new byte[1 << 16];

    //
    // Static initialization
    //

    static {

        //
        // [4] NameChar ::= Letter | Digit | '.' | '-' | '_' | ':' |
        //                  CombiningChar | Extender
        //

        int nameChar[] = {
            0x002D, 0x002E, // '-' and '.'
        };

        //
        // [5] Name ::= (Letter | '_' | ':') (NameChar)*
        //

        int nameStartChar[] = {
            0x003A, 0x005F, // ':' and '_'
        };

        //
        // [84] Letter ::= BaseChar | Ideographic
        //

        // Range tables hold inclusive (first, last) pairs.
        int letterRange[] = {
            // BaseChar
            0x0041, 0x005A, 0x0061, 0x007A, 0x00C0, 0x00D6, 0x00D8, 0x00F6,
            0x00F8, 0x0131, 0x0134, 0x013E, 0x0141, 0x0148, 0x014A, 0x017E,
            0x0180, 0x01C3, 0x01CD, 0x01F0, 0x01F4, 0x01F5, 0x01FA, 0x0217,
            0x0250, 0x02A8, 0x02BB, 0x02C1, 0x0388, 0x038A, 0x038E, 0x03A1,
            0x03A3, 0x03CE, 0x03D0, 0x03D6, 0x03E2, 0x03F3, 0x0401, 0x040C,
            0x040E, 0x044F, 0x0451, 0x045C, 0x045E, 0x0481, 0x0490, 0x04C4,
            0x04C7, 0x04C8, 0x04CB, 0x04CC, 0x04D0, 0x04EB, 0x04EE, 0x04F5,
            0x04F8, 0x04F9, 0x0531, 0x0556, 0x0561, 0x0586, 0x05D0, 0x05EA,
            0x05F0, 0x05F2, 0x0621, 0x063A, 0x0641, 0x064A, 0x0671, 0x06B7,
            0x06BA, 0x06BE, 0x06C0, 0x06CE, 0x06D0, 0x06D3, 0x06E5, 0x06E6,
            0x0905, 0x0939, 0x0958, 0x0961, 0x0985, 0x098C, 0x098F, 0x0990,
            0x0993, 0x09A8, 0x09AA, 0x09B0, 0x09B6, 0x09B9, 0x09DC, 0x09DD,
            0x09DF, 0x09E1, 0x09F0, 0x09F1, 0x0A05, 0x0A0A, 0x0A0F, 0x0A10,
            0x0A13, 0x0A28, 0x0A2A, 0x0A30, 0x0A32, 0x0A33, 0x0A35, 0x0A36,
            0x0A38, 0x0A39, 0x0A59, 0x0A5C, 0x0A72, 0x0A74, 0x0A85, 0x0A8B,
            0x0A8F, 0x0A91, 0x0A93, 0x0AA8, 0x0AAA, 0x0AB0, 0x0AB2, 0x0AB3,
            0x0AB5, 0x0AB9, 0x0B05, 0x0B0C, 0x0B0F, 0x0B10, 0x0B13, 0x0B28,
            0x0B2A, 0x0B30, 0x0B32, 0x0B33, 0x0B36, 0x0B39, 0x0B5C, 0x0B5D,
            0x0B5F, 0x0B61, 0x0B85, 0x0B8A, 0x0B8E, 0x0B90, 0x0B92, 0x0B95,
            0x0B99, 0x0B9A, 0x0B9E, 0x0B9F, 0x0BA3, 0x0BA4, 0x0BA8, 0x0BAA,
            0x0BAE, 0x0BB5, 0x0BB7, 0x0BB9, 0x0C05, 0x0C0C, 0x0C0E, 0x0C10,
            0x0C12, 0x0C28, 0x0C2A, 0x0C33, 0x0C35, 0x0C39, 0x0C60, 0x0C61,
            0x0C85, 0x0C8C, 0x0C8E, 0x0C90, 0x0C92, 0x0CA8, 0x0CAA, 0x0CB3,
            0x0CB5, 0x0CB9, 0x0CE0, 0x0CE1, 0x0D05, 0x0D0C, 0x0D0E, 0x0D10,
            0x0D12, 0x0D28, 0x0D2A, 0x0D39, 0x0D60, 0x0D61, 0x0E01, 0x0E2E,
            0x0E32, 0x0E33, 0x0E40, 0x0E45, 0x0E81, 0x0E82, 0x0E87, 0x0E88,
            0x0E94, 0x0E97, 0x0E99, 0x0E9F, 0x0EA1, 0x0EA3, 0x0EAA, 0x0EAB,
            0x0EAD, 0x0EAE, 0x0EB2, 0x0EB3, 0x0EC0, 0x0EC4, 0x0F40, 0x0F47,
            0x0F49, 0x0F69, 0x10A0, 0x10C5, 0x10D0, 0x10F6, 0x1102, 0x1103,
            0x1105, 0x1107, 0x110B, 0x110C, 0x110E, 0x1112, 0x1154, 0x1155,
            0x115F, 0x1161, 0x116D, 0x116E, 0x1172, 0x1173, 0x11AE, 0x11AF,
            0x11B7, 0x11B8, 0x11BC, 0x11C2, 0x1E00, 0x1E9B, 0x1EA0, 0x1EF9,
            0x1F00, 0x1F15, 0x1F18, 0x1F1D, 0x1F20, 0x1F45, 0x1F48, 0x1F4D,
            0x1F50, 0x1F57, 0x1F5F, 0x1F7D, 0x1F80, 0x1FB4, 0x1FB6, 0x1FBC,
            0x1FC2, 0x1FC4, 0x1FC6, 0x1FCC, 0x1FD0, 0x1FD3, 0x1FD6, 0x1FDB,
            0x1FE0, 0x1FEC, 0x1FF2, 0x1FF4, 0x1FF6, 0x1FFC, 0x212A, 0x212B,
            0x2180, 0x2182, 0x3041, 0x3094, 0x30A1, 0x30FA, 0x3105, 0x312C,
            0xAC00, 0xD7A3,
            // Ideographic
            0x3021, 0x3029, 0x4E00, 0x9FA5,
        };
        int letterChar[] = {
            // BaseChar
            0x0386, 0x038C, 0x03DA, 0x03DC, 0x03DE, 0x03E0, 0x0559, 0x06D5,
            0x093D, 0x09B2, 0x0A5E, 0x0A8D, 0x0ABD, 0x0AE0, 0x0B3D, 0x0B9C,
            0x0CDE, 0x0E30, 0x0E84, 0x0E8A, 0x0E8D, 0x0EA5, 0x0EA7, 0x0EB0,
            0x0EBD, 0x1100, 0x1109, 0x113C, 0x113E, 0x1140, 0x114C, 0x114E,
            0x1150, 0x1159, 0x1163, 0x1165, 0x1167, 0x1169, 0x1175, 0x119E,
            0x11A8, 0x11AB, 0x11BA, 0x11EB, 0x11F0, 0x11F9, 0x1F59, 0x1F5B,
            0x1F5D, 0x1FBE, 0x2126, 0x212E,
            // Ideographic
            0x3007,
        };

        //
        // [87] CombiningChar ::= ...
        //

        int combiningCharRange[] = {
            0x0300, 0x0345, 0x0360, 0x0361, 0x0483, 0x0486, 0x0591, 0x05A1,
            0x05A3, 0x05B9, 0x05BB, 0x05BD, 0x05C1, 0x05C2, 0x064B, 0x0652,
            0x06D6, 0x06DC, 0x06DD, 0x06DF, 0x06E0, 0x06E4, 0x06E7, 0x06E8,
            0x06EA, 0x06ED, 0x0901, 0x0903, 0x093E, 0x094C, 0x0951, 0x0954,
            0x0962, 0x0963, 0x0981, 0x0983, 0x09C0, 0x09C4, 0x09C7, 0x09C8,
            0x09CB, 0x09CD, 0x09E2, 0x09E3, 0x0A40, 0x0A42, 0x0A47, 0x0A48,
            0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A83, 0x0ABE, 0x0AC5,
            0x0AC7, 0x0AC9, 0x0ACB, 0x0ACD, 0x0B01, 0x0B03, 0x0B3E, 0x0B43,
            0x0B47, 0x0B48, 0x0B4B, 0x0B4D, 0x0B56, 0x0B57, 0x0B82, 0x0B83,
            0x0BBE, 0x0BC2, 0x0BC6, 0x0BC8, 0x0BCA, 0x0BCD, 0x0C01, 0x0C03,
            0x0C3E, 0x0C44, 0x0C46, 0x0C48, 0x0C4A, 0x0C4D, 0x0C55, 0x0C56,
            0x0C82, 0x0C83, 0x0CBE, 0x0CC4, 0x0CC6, 0x0CC8, 0x0CCA, 0x0CCD,
            0x0CD5, 0x0CD6, 0x0D02, 0x0D03, 0x0D3E, 0x0D43, 0x0D46, 0x0D48,
            0x0D4A, 0x0D4D, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB4, 0x0EB9,
            0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19, 0x0F71, 0x0F84,
            0x0F86, 0x0F8B, 0x0F90, 0x0F95, 0x0F99, 0x0FAD, 0x0FB1, 0x0FB7,
            0x20D0, 0x20DC, 0x302A, 0x302F,
        };

        int combiningCharChar[] = {
            0x05BF, 0x05C4, 0x0670, 0x093C, 0x094D, 0x09BC, 0x09BE, 0x09BF,
            0x09D7, 0x0A02, 0x0A3C, 0x0A3E, 0x0A3F, 0x0ABC, 0x0B3C, 0x0BD7,
            0x0D57, 0x0E31, 0x0EB1, 0x0F35, 0x0F37, 0x0F39, 0x0F3E, 0x0F3F,
            0x0F97, 0x0FB9, 0x20E1, 0x3099, 0x309A,
        };

        //
        // [88] Digit ::= ...
        //

        int digitRange[] = {
            0x0030, 0x0039, 0x0660, 0x0669, 0x06F0, 0x06F9, 0x0966, 0x096F,
            0x09E6, 0x09EF, 0x0A66, 0x0A6F, 0x0AE6, 0x0AEF, 0x0B66, 0x0B6F,
            0x0BE7, 0x0BEF, 0x0C66, 0x0C6F, 0x0CE6, 0x0CEF, 0x0D66, 0x0D6F,
            0x0E50, 0x0E59, 0x0ED0, 0x0ED9, 0x0F20, 0x0F29,
        };

        //
        // [89] Extender ::= ...
        //

        int extenderRange[] = {
            0x3031, 0x3035, 0x309D, 0x309E, 0x30FC, 0x30FE,
        };

        int extenderChar[] = {
            0x00B7, 0x02D0, 0x02D1, 0x0387, 0x0640, 0x0E46, 0x0EC6, 0x3005,
        };

        //
        // Initialize
        //

        // A character that may start a name may also appear later in it,
        // so name-start characters receive both flags.
        final int startAndName = MASK_NAME_START | MASK_NAME;
        flagChars(nameStartChar, startAndName);
        flagRanges(letterRange, startAndName);
        flagChars(letterChar, startAndName);

        // Characters that are only legal after the first position.
        flagChars(nameChar, MASK_NAME);
        flagRanges(digitRange, MASK_NAME);
        flagRanges(combiningCharRange, MASK_NAME);
        flagChars(combiningCharChar, MASK_NAME);
        flagRanges(extenderRange, MASK_NAME);
        flagChars(extenderChar, MASK_NAME);

    }

    /**
     * ORs <code>flags</code> into {@link #CHARS} for every character covered
     * by the inclusive (first, last) pairs stored in <code>bounds</code>.
     */
    private static void flagRanges(int[] bounds, int flags) {
        for (int i = 0; i < bounds.length; i += 2) {
            for (int c = bounds[i]; c <= bounds[i + 1]; c++) {
                CHARS[c] |= flags;
            }
        }
    }

    /**
     * ORs <code>flags</code> into {@link #CHARS} for each individual
     * character listed in <code>chars</code>.
     */
    private static void flagChars(int[] chars, int flags) {
        for (int i = 0; i < chars.length; i++) {
            CHARS[chars[i]] |= flags;
        }
    }

    // Constructor
    //-------------------------------------------------------------------------

    /**
     * <p>Constructor for use by tools that require <code>JavaBean</code> instances.</p>
     *
     * <p>This constructor is public <strong>only</strong>
     * to permit tools that require a JavaBean instance to operate.
     * <code>XMLUtils</code> instances should <strong>not</strong> be constructed in standard
     * programming. Instead, the class methods should be called directly.</p>
     */
    public XMLUtils() {}

    // Class methods
    //-------------------------------------------------------------------------

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use as body text.</p>
     *
     * <p>The characters <code>&lt;</code>, <code>&gt;</code> and
     * <code>&amp;</code> are replaced by their entities; everything else is
     * copied unchanged.</p>
     *
     * @param value escape <code>value.toString()</code>
     * @return text with escaped delimiters
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeBodyValue(Object value) {
        return escape(value.toString(), false);
    }

    /**
     * <p>Escape the <code>toString</code> of the given object.
     * For use in an attribute value.</p>
     *
     * <p>In addition to the characters escaped by {@link #escapeBodyValue},
     * the apostrophe and the double quote are replaced by their entities.</p>
     *
     * @param value escape <code>value.toString()</code>
     * @return text with characters restricted (for use in attributes) escaped
     * @throws NullPointerException if <code>value</code> is null
     */
    public static final String escapeAttributeValue(Object value) {
        return escape(value.toString(), true);
    }

    /**
     * Copies <code>text</code> into a new string in a single pass, replacing
     * each character that has an entity (see {@link #entityFor}) with it.
     */
    private static String escape(String text, boolean inAttribute) {
        int length = text.length();
        // Leave a little headroom so a few entities do not force a regrow.
        StringBuffer escaped = new StringBuffer(length + 16);
        for (int i = 0; i < length; i++) {
            char c = text.charAt(i);
            String entity = entityFor(c, inAttribute);
            if (entity == null) {
                escaped.append(c);
            } else {
                escaped.append(entity);
            }
        }
        return escaped.toString();
    }

    /**
     * Returns the entity that must replace <code>c</code>, or null when the
     * character can be written as is. Quote characters are only escaped when
     * <code>inAttribute</code> is true.
     */
    private static String entityFor(char c, boolean inAttribute) {
        switch (c) {
            case '<':
                return LESS_THAN_ENTITY;
            case '>':
                return GREATER_THAN_ENTITY;
            case '&':
                return AMPERSAND_ENTITY;
            case '\'':
                return inAttribute ? APOSTROPHE_ENTITY : null;
            case '\"':
                return inAttribute ? QUOTE_ENTITY : null;
            default:
                return null;
        }
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]>' is recognized as markup.
     * @param content the body content whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml.
     * @return escaped character data, not null
     * @throws NullPointerException if <code>content</code> is null
     */
    public static final String escapeCDATAContent(String content) {
        StringBuffer buffer = new StringBuffer(content);
        escapeCDATAContent(buffer);
        return buffer.toString();
    }

    /**
     * Escapes the given content suitable for insertion within a
     * <code>CDATA</code> sequence.
     * Within a <code>CDATA</code> section, only the <code>CDEnd</code>
     * string ']]>' is recognized as markup.
     * The buffer is modified in place: the <code>&gt;</code> of every
     * ']]>' sequence is replaced by {@link #GREATER_THAN_ENTITY}.
     * @param bufferedContent the body content within a buffer
     * whose character data should
     * be escaped in a way appropriate for use within a <code>CDATA</code>
     * section of xml
     */
    public static final void escapeCDATAContent(StringBuffer bufferedContent) {
        // NOTE(review): entity references are not expanded inside a CDATA
        // section, so a consumer will read the literal text "]]&gt;" -
        // confirm that this is what callers expect.

        // A terminator needs two preceding characters, hence the start at 2.
        for (int i = 2, size = bufferedContent.length(); i < size; i++) {
            if (bufferedContent.charAt(i) == '>'
                    && bufferedContent.charAt(i - 1) == ']'
                    && bufferedContent.charAt(i - 2) == ']') {

                bufferedContent.replace(i, i + 1, GREATER_THAN_ENTITY);
                // One character became four: grow the bound and step over
                // the inserted entity (the loop increment adds the last step).
                size += 3;
                i += 3;
            }
        }
    }

    /**
     * <p>Is this string a well formed xml name?</p>
     *
     * <p>Only certain characters are allowed in well formed element and attribute
     * names in xml. For example, white space is not allowed in a name.</p>
     *
     * <p>The code for this method is based on code in
     * <code>org.apache.xerces.util.XMLChar</code>
     * in <a href='http://xerces.apache.org/xerces2-j/index.html'>Apache Xerces</a>.
     * The authors of this class are credited at the top of this class.</p>
     *
     * @param name the <code>String</code> to be checked for use as an xml attribute
     * or element name. Returns false if <code>name</code> is null or empty
     * @return true if this string would be a well-formed name
     */
    public static boolean isWellFormedXMLName( String name ) {
        if ( name == null || name.length() == 0 ) {
            return false;
        }

        // The first character is held to the stricter name-start rule.
        if ( !isNameStartChar( name.charAt(0) ) ) {
            return false;
        }

        for (int i = 1; i < name.length(); i++ ) {
            if ( !isNameChar( name.charAt(i) ) ) {
                return false;
            }
        }
        return true;
    }

    /**
     * Returns true if the specified character is a valid name
     * character as defined by the XML 1.0 specification.
     * Values outside <code>0x0000</code> to <code>0xFFFF</code>, including
     * negative values, are never name characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name character
     */
    public static boolean isNameChar(int c) {
        return hasFlag(c, MASK_NAME);
    }

    /**
     * Returns true if the specified character is a valid name start
     * character as defined in the XML 1.0 specification.
     * Values outside <code>0x0000</code> to <code>0xFFFF</code>, including
     * negative values, are never name start characters.
     *
     * @param c The character to check.
     * @return true if this is an XML name start character
     */
    public static boolean isNameStartChar(int c) {
        return hasFlag(c, MASK_NAME_START);
    }

    /**
     * Tests whether <code>c</code> lies inside the flag table and has
     * <code>mask</code> set; the range check keeps negative or
     * supplementary values from indexing outside {@link #CHARS}.
     */
    private static boolean hasFlag(int c, int mask) {
        return c >= 0 && c < CHARS.length && (CHARS[c] & mask) != 0;
    }
}