001    // Copyright 2004, 2005 The Apache Software Foundation
002    //
003    // Licensed under the Apache License, Version 2.0 (the "License");
004    // you may not use this file except in compliance with the License.
005    // You may obtain a copy of the License at
006    //
007    //     http://www.apache.org/licenses/LICENSE-2.0
008    //
009    // Unless required by applicable law or agreed to in writing, software
010    // distributed under the License is distributed on an "AS IS" BASIS,
011    // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
012    // See the License for the specific language governing permissions and
013    // limitations under the License.
014    
015    package org.apache.tapestry.util.text;
016    
017    /**
018     * An object that encodes a character according to rules of the HTML specification, 
019     * so that it will be properly parsed by a browser irrespectively of the character
020     * encoding used in the HTML output.
021     * 
022     * @author mb
023     * @since 4.0
024     */
025    public class MarkupCharacterTranslator implements ICharacterTranslator
026    {
027        private static final String SAFE_CHARACTERS =
028            "01234567890"
029                + "abcdefghijklmnopqrstuvwxyz"
030                + "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
031                + "\t\n\r !#$%'()*+,-./:;=?@[\\]^_`{|}~";
032    
033        private static final String[][] ENTITIES = {
034            { "\"", """ }, 
035                    { "<", "<" },
036                    { ">", ">" },
037                    { "&", "&" }
038        };
039        
040        private static final ICharacterMatcher SAFE_MATCHER = new AsciiCharacterMatcher(SAFE_CHARACTERS);
041        private static final ICharacterTranslator ENTITY_TRANSLATOR = new AsciiCharacterTranslator(ENTITIES);
042        
043        private boolean _encodeNonAscii;
044        private ICharacterMatcher _safeMatcher;
045        private ICharacterTranslator _entityTranslator;
046            
047        public MarkupCharacterTranslator()
048        {
049            this(true);
050        }
051        
052        public MarkupCharacterTranslator(boolean encodeNonAscii)
053        {
054            this(encodeNonAscii, SAFE_MATCHER, ENTITY_TRANSLATOR);
055        }
056        
057        public MarkupCharacterTranslator(boolean encodeNonAscii, ICharacterMatcher safeMatcher, ICharacterTranslator entityTranslator)
058        {
059            _encodeNonAscii = encodeNonAscii;
060            _safeMatcher = safeMatcher;
061            _entityTranslator = entityTranslator;
062        }
063    
064        public MarkupCharacterTranslator(boolean encodeNonAscii, String safeCharacters, String[][] entities)
065        {
066            _encodeNonAscii = encodeNonAscii;
067            _safeMatcher = new AsciiCharacterMatcher(safeCharacters);
068            _entityTranslator = new AsciiCharacterTranslator(entities);
069        }
070        
071            /**
072             * @see org.apache.tapestry.util.text.IMarkupCharacterTranslator#translateAttribute(char)
073             */
074            public String translate(char ch) {
075                    // IE and Firefox do not handle characters between 128 and 159 well, 
076                    // so they have to be quoted as well 
077                    if (ch >= 160 && !_encodeNonAscii) 
078                            return null;
079                    
080                    if (_safeMatcher.matches(ch))
081                            return null;
082    
083                    String entity = _entityTranslator.translate(ch);
084                    if (entity != null)
085                            return entity;
086                    
087                    // needs to use a NumberFormat here to be fully compliant, 
088                    // but this is accepted fine by the browsers
089                    return "&#" + (int) ch + ";";
090            }
091    }