001/*
002 * Copyright © 2025 CUI-OpenSource-Software (info@cuioss.de)
003 *
004 * Licensed under the Apache License, Version 2.0 (the "License");
005 * you may not use this file except in compliance with the License.
006 * You may obtain a copy of the License at
007 *
008 *     http://www.apache.org/licenses/LICENSE-2.0
009 *
010 * Unless required by applicable law or agreed to in writing, software
011 * distributed under the License is distributed on an "AS IS" BASIS,
012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
013 * See the License for the specific language governing permissions and
014 * limitations under the License.
015 */
016package de.cuioss.http.security.validation;
017
018import de.cuioss.http.security.core.ValidationType;
019
020import java.util.BitSet;
021
022/**
023 * RFC-compliant character set definitions for HTTP component validation.
024 *
025 * <p>This utility class provides pre-computed BitSet instances containing allowed characters
026 * for different HTTP components according to RFC 3986 (URI) and RFC 7230 (HTTP) specifications.
027 * All character sets are optimized for high-performance validation with O(1) character lookups.</p>
028 *
029 * <h3>Design Principles</h3>
030 * <ul>
031 *   <li><strong>RFC Compliance</strong> - Strict adherence to HTTP and URI specifications</li>
032 *   <li><strong>Performance Optimized</strong> - Pre-computed BitSets for O(1) character validation</li>
033 *   <li><strong>Thread Safety</strong> - Immutable after initialization, safe for concurrent access</li>
034 *   <li><strong>Memory Efficient</strong> - Shared instances reduce memory overhead</li>
035 * </ul>
036 *
037 * <h3>Character Set Categories</h3>
038 * <ul>
039 *   <li><strong>RFC3986_UNRESERVED</strong> - Basic unreserved characters from RFC 3986</li>
040 *   <li><strong>RFC3986_PATH_CHARS</strong> - Characters allowed in URL paths</li>
041 *   <li><strong>RFC3986_QUERY_CHARS</strong> - Characters allowed in URL query parameters</li>
042 *   <li><strong>RFC7230_HEADER_CHARS</strong> - Characters allowed in HTTP headers</li>
043 *   <li><strong>HTTP_BODY_CHARS</strong> - Characters allowed in HTTP request/response bodies</li>
044 * </ul>
045 *
046 * <h3>Usage Examples</h3>
047 * <pre>
048 * // Get character set for URL path validation
049 * BitSet pathChars = CharacterValidationConstants.getCharacterSet(ValidationType.URL_PATH);
050 *
051 * // Check if character is allowed in URL paths
052 * char ch = '/';
053 * boolean isAllowed = pathChars.get(ch); // Returns true
054 *
055 * // Validate string characters
056 * String input = "/api/users";
057 * for (int i = 0; i &lt; input.length(); i++) {
058 *     char c = input.charAt(i);
059 *     if (!pathChars.get(c)) {
060 *         throw new IllegalArgumentException("Invalid character: " + c);
061 *     }
062 * }
063 * </pre>
064 *
065 * <h3>Performance Characteristics</h3>
066 * <ul>
067 *   <li>O(1) character lookup time using BitSet.get()</li>
068 *   <li>Minimal memory footprint - shared across all validators</li>
069 *   <li>No runtime computation - all sets pre-computed during class loading</li>
070 *   <li>Thread-safe concurrent access without synchronization</li>
071 * </ul>
072 *
073 * <h3>RFC References</h3>
074 * <ul>
075 *   <li><strong>RFC 3986</strong> - Uniform Resource Identifier (URI) character definitions</li>
076 *   <li><strong>RFC 7230</strong> - HTTP/1.1 Message Syntax and Routing header field definitions</li>
077 * </ul>
078 *
079 * <p><strong>Security Note:</strong> These character sets define <em>allowed</em> characters only.
080 * Additional security validation (pattern matching, length limits, etc.) should be applied
081 * by higher-level validation stages.</p>
082 * <p>
083 * Implements: Task V5 from HTTP verification specification
084 *
085 * @see ValidationType
086 * @see de.cuioss.http.security.validation.CharacterValidationStage
087 * @since 1.0
088 */
089public final class CharacterValidationConstants {
090
091    private CharacterValidationConstants() {
092        // Utility class
093    }
094
095    /**
096     * RFC 3986 unreserved characters: ALPHA / DIGIT / "-" / "." / "_" / "~".
097     * <p>These are the basic safe characters allowed in URIs without percent-encoding.</p>
098     */
099    public static final BitSet RFC3986_UNRESERVED;
100
101    /**
102     * RFC 3986 path characters including unreserved + path-specific characters.
103     * <p>Includes all unreserved characters plus: / @ : ! $ &amp; ' ( ) * + , ; =</p>
104     */
105    public static final BitSet RFC3986_PATH_CHARS;
106
107    /**
108     * RFC 3986 query characters including unreserved + query-specific characters.
109     * <p>Includes all unreserved characters plus: ? &amp; = ! $ ' ( ) * + , ;</p>
110     */
111    public static final BitSet RFC3986_QUERY_CHARS;
112
113    /**
114     * RFC 7230 header field characters (visible ASCII minus delimiters).
115     * <p>Includes space through tilde (32-126) plus tab character.</p>
116     */
117    public static final BitSet RFC7230_HEADER_CHARS;
118
119    /**
120     * HTTP body content characters (permissive for JSON, XML, text, etc.).
121     * <p>Includes printable ASCII (32-126), tab, LF, CR, and extended ASCII (128-255).</p>
122     */
123    public static final BitSet HTTP_BODY_CHARS;
124
125    static {
126        // Initialize RFC3986_UNRESERVED
127        BitSet unreserved = new BitSet(256);
128        // ALPHA
129        for (int i = 'A'; i <= 'Z'; i++) unreserved.set(i);
130        for (int i = 'a'; i <= 'z'; i++) unreserved.set(i);
131        // DIGIT
132        for (int i = '0'; i <= '9'; i++) unreserved.set(i);
133        // "-" / "." / "_" / "~"
134        unreserved.set('-');
135        unreserved.set('.');
136        unreserved.set('_');
137        unreserved.set('~');
138        RFC3986_UNRESERVED = unreserved;
139
140        // Initialize RFC3986_PATH_CHARS
141        BitSet pathChars = new BitSet(256);
142        pathChars.or(unreserved);  // Include all unreserved chars
143        pathChars.set('/');
144        pathChars.set('@');
145        pathChars.set(':');
146        // sub-delims for path: "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "="
147        "!$&'()*+,;=".chars().forEach(pathChars::set);
148        RFC3986_PATH_CHARS = pathChars;
149
150        // Initialize RFC3986_QUERY_CHARS
151        BitSet queryChars = new BitSet(256);
152        queryChars.or(unreserved);  // Include all unreserved chars
153        queryChars.set('?');
154        queryChars.set('&');
155        queryChars.set('=');
156        // sub-delims for query
157        "!$'()*+,;".chars().forEach(queryChars::set);
158        RFC3986_QUERY_CHARS = queryChars;
159
160        // Initialize RFC7230_HEADER_CHARS
161        BitSet headerChars = new BitSet(256);
162        // RFC 7230: For header values, allow most visible ASCII plus space and tab
163        // Only exclude control chars and characters that could break HTTP parsing
164        for (int i = 32; i <= 126; i++) { // Include space (32) through tilde (126)
165            headerChars.set(i);
166        }
167        headerChars.set('\t'); // Tab is allowed in headers
168        // Only exclude characters that could break HTTP: CR, LF, NULL
169        // Note: Other dangerous chars are handled at application level
170        RFC7230_HEADER_CHARS = headerChars;
171
172        // Initialize HTTP_BODY_CHARS (very permissive for body content)
173        BitSet bodyChars = new BitSet(256);
174        // Allow all printable ASCII and extended characters
175        for (int i = 32; i <= 126; i++) { // ASCII printable characters
176            bodyChars.set(i);
177        }
178        // Allow common whitespace characters
179        bodyChars.set('\t');  // Tab (0x09)
180        bodyChars.set('\n');  // Line feed (0x0A)
181        bodyChars.set('\r');  // Carriage return (0x0D)
182        // Allow extended ASCII and Unicode range (128-255)
183        for (int i = 128; i <= 255; i++) {
184            bodyChars.set(i);
185        }
186        // Note: Null bytes and other control chars (1-31) are excluded by default
187        // They can be allowed via configuration if needed
188        HTTP_BODY_CHARS = bodyChars;
189    }
190
191    /**
192     * Returns the appropriate character set for the specified validation type.
193     *
194     * <p>This method provides a centralized mapping from validation types to their
195     * corresponding RFC-compliant character sets. The returned BitSet is the actual
196     * instance (not a copy) for performance reasons and must not be modified.</p>
197     *
198     * <h4>Validation Type Mappings:</h4>
199     * <ul>
200     *   <li>{@code URL_PATH} → {@link #RFC3986_PATH_CHARS}</li>
201     *   <li>{@code PARAMETER_NAME, PARAMETER_VALUE} → {@link #RFC3986_QUERY_CHARS}</li>
202     *   <li>{@code HEADER_NAME, HEADER_VALUE} → {@link #RFC7230_HEADER_CHARS}</li>
203     *   <li>{@code BODY} → {@link #HTTP_BODY_CHARS}</li>
204     *   <li>{@code COOKIE_NAME, COOKIE_VALUE} → {@link #RFC3986_UNRESERVED}</li>
205     * </ul>
206     *
207     * @param type The validation type specifying which HTTP component is being validated
208     * @return The corresponding BitSet containing allowed characters for the validation type
209     * @throws NullPointerException if {@code type} is null
210     * @see ValidationType
211     * @see #RFC3986_PATH_CHARS
212     * @see #RFC3986_QUERY_CHARS
213     * @see #RFC7230_HEADER_CHARS
214     * @see #HTTP_BODY_CHARS
215     * @see #RFC3986_UNRESERVED
216     */
217    public static BitSet getCharacterSet(ValidationType type) {
218        return switch (type) {
219            case URL_PATH -> RFC3986_PATH_CHARS;
220            case PARAMETER_NAME, PARAMETER_VALUE -> RFC3986_QUERY_CHARS;
221            case HEADER_NAME, HEADER_VALUE -> RFC7230_HEADER_CHARS;
222            case BODY -> HTTP_BODY_CHARS;
223            case COOKIE_NAME, COOKIE_VALUE -> RFC3986_UNRESERVED;
224        };
225    }
226}