001/* 002 * Copyright © 2025 CUI-OpenSource-Software (info@cuioss.de) 003 * 004 * Licensed under the Apache License, Version 2.0 (the "License"); 005 * you may not use this file except in compliance with the License. 006 * You may obtain a copy of the License at 007 * 008 * http://www.apache.org/licenses/LICENSE-2.0 009 * 010 * Unless required by applicable law or agreed to in writing, software 011 * distributed under the License is distributed on an "AS IS" BASIS, 012 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 013 * See the License for the specific language governing permissions and 014 * limitations under the License. 015 */ 016package de.cuioss.http.security.validation; 017 018import de.cuioss.http.security.core.ValidationType; 019 020import java.util.BitSet; 021 022/** 023 * RFC-compliant character set definitions for HTTP component validation. 024 * 025 * <p>This utility class provides pre-computed BitSet instances containing allowed characters 026 * for different HTTP components according to RFC 3986 (URI) and RFC 7230 (HTTP) specifications. 027 * All character sets are optimized for high-performance validation with O(1) character lookups.</p> 028 * 029 * <h3>Design Principles</h3> 030 * <ul> 031 * <li><strong>RFC Compliance</strong> - Strict adherence to HTTP and URI specifications</li> 032 * <li><strong>Performance Optimized</strong> - Pre-computed BitSets for O(1) character validation</li> 033 * <li><strong>Thread Safety</strong> - Immutable after initialization, safe for concurrent access</li> 034 * <li><strong>Memory Efficient</strong> - Shared instances reduce memory overhead</li> 035 * </ul> 036 * 037 * <h3>Character Set Categories</h3> 038 * <ul> 039 * <li><strong>RFC3986_UNRESERVED</strong> - Basic unreserved characters from RFC 3986</li> 040 * <li><strong>RFC3986_PATH_CHARS</strong> - Characters allowed in URL paths</li> 041 * <li><strong>RFC3986_QUERY_CHARS</strong> - Characters allowed in URL query parameters</li> 042 * <li><strong>RFC7230_HEADER_CHARS</strong> - Characters allowed in HTTP headers</li> 043 * <li><strong>HTTP_BODY_CHARS</strong> - Characters allowed in HTTP request/response bodies</li> 044 * </ul> 045 * 046 * <h3>Usage Examples</h3> 047 * <pre> 048 * // Get character set for URL path validation 049 * BitSet pathChars = CharacterValidationConstants.getCharacterSet(ValidationType.URL_PATH); 050 * 051 * // Check if character is allowed in URL paths 052 * char ch = '/'; 053 * boolean isAllowed = pathChars.get(ch); // Returns true 054 * 055 * // Validate string characters 056 * String input = "/api/users"; 057 * for (int i = 0; i < input.length(); i++) { 058 * char c = input.charAt(i); 059 * if (!pathChars.get(c)) { 060 * throw new IllegalArgumentException("Invalid character: " + c); 061 * } 062 * } 063 * </pre> 064 * 065 * <h3>Performance Characteristics</h3> 066 * <ul> 067 * <li>O(1) character lookup time using BitSet.get()</li> 068 * <li>Minimal memory footprint - shared across all validators</li> 069 * <li>No runtime computation - all sets pre-computed during class loading</li> 070 * <li>Thread-safe concurrent access without synchronization</li> 071 * </ul> 072 * 073 * <h3>RFC References</h3> 074 * <ul> 075 * <li><strong>RFC 3986</strong> - Uniform Resource Identifier (URI) character definitions</li> 076 * <li><strong>RFC 7230</strong> - HTTP/1.1 Message Syntax and Routing header field definitions</li> 077 * </ul> 078 * 079 * <p><strong>Security Note:</strong> These character sets define <em>allowed</em> characters only. 080 * Additional security validation (pattern matching, length limits, etc.) should be applied 081 * by higher-level validation stages.</p> 082 * <p> 083 * Implements: Task V5 from HTTP verification specification 084 * 085 * @see ValidationType 086 * @see de.cuioss.http.security.validation.CharacterValidationStage 087 * @since 1.0 088 */ 089public final class CharacterValidationConstants { 090 091 private CharacterValidationConstants() { 092 // Utility class 093 } 094 095 /** 096 * RFC 3986 unreserved characters: ALPHA / DIGIT / "-" / "." / "_" / "~". 097 * <p>These are the basic safe characters allowed in URIs without percent-encoding.</p> 098 */ 099 public static final BitSet RFC3986_UNRESERVED; 100 101 /** 102 * RFC 3986 path characters including unreserved + path-specific characters. 103 * <p>Includes all unreserved characters plus: / @ : ! $ & ' ( ) * + , ; =</p> 104 */ 105 public static final BitSet RFC3986_PATH_CHARS; 106 107 /** 108 * RFC 3986 query characters including unreserved + query-specific characters. 109 * <p>Includes all unreserved characters plus: ? & = ! $ ' ( ) * + , ;</p> 110 */ 111 public static final BitSet RFC3986_QUERY_CHARS; 112 113 /** 114 * RFC 7230 header field characters (visible ASCII minus delimiters). 115 * <p>Includes space through tilde (32-126) plus tab character.</p> 116 */ 117 public static final BitSet RFC7230_HEADER_CHARS; 118 119 /** 120 * HTTP body content characters (permissive for JSON, XML, text, etc.). 121 * <p>Includes printable ASCII (32-126), tab, LF, CR, and extended ASCII (128-255).</p> 122 */ 123 public static final BitSet HTTP_BODY_CHARS; 124 125 static { 126 // Initialize RFC3986_UNRESERVED 127 BitSet unreserved = new BitSet(256); 128 // ALPHA 129 for (int i = 'A'; i <= 'Z'; i++) unreserved.set(i); 130 for (int i = 'a'; i <= 'z'; i++) unreserved.set(i); 131 // DIGIT 132 for (int i = '0'; i <= '9'; i++) unreserved.set(i); 133 // "-" / "." / "_" / "~" 134 unreserved.set('-'); 135 unreserved.set('.'); 136 unreserved.set('_'); 137 unreserved.set('~'); 138 RFC3986_UNRESERVED = unreserved; 139 140 // Initialize RFC3986_PATH_CHARS 141 BitSet pathChars = new BitSet(256); 142 pathChars.or(unreserved); // Include all unreserved chars 143 pathChars.set('/'); 144 pathChars.set('@'); 145 pathChars.set(':'); 146 // sub-delims for path: "!" / "$" / "&" / "'" / "(" / ")" / "*" / "+" / "," / ";" / "=" 147 "!$&'()*+,;=".chars().forEach(pathChars::set); 148 RFC3986_PATH_CHARS = pathChars; 149 150 // Initialize RFC3986_QUERY_CHARS 151 BitSet queryChars = new BitSet(256); 152 queryChars.or(unreserved); // Include all unreserved chars 153 queryChars.set('?'); 154 queryChars.set('&'); 155 queryChars.set('='); 156 // sub-delims for query 157 "!$'()*+,;".chars().forEach(queryChars::set); 158 RFC3986_QUERY_CHARS = queryChars; 159 160 // Initialize RFC7230_HEADER_CHARS 161 BitSet headerChars = new BitSet(256); 162 // RFC 7230: For header values, allow most visible ASCII plus space and tab 163 // Only exclude control chars and characters that could break HTTP parsing 164 for (int i = 32; i <= 126; i++) { // Include space (32) through tilde (126) 165 headerChars.set(i); 166 } 167 headerChars.set('\t'); // Tab is allowed in headers 168 // Only exclude characters that could break HTTP: CR, LF, NULL 169 // Note: Other dangerous chars are handled at application level 170 RFC7230_HEADER_CHARS = headerChars; 171 172 // Initialize HTTP_BODY_CHARS (very permissive for body content) 173 BitSet bodyChars = new BitSet(256); 174 // Allow all printable ASCII and extended characters 175 for (int i = 32; i <= 126; i++) { // ASCII printable characters 176 bodyChars.set(i); 177 } 178 // Allow common whitespace characters 179 bodyChars.set('\t'); // Tab (0x09) 180 bodyChars.set('\n'); // Line feed (0x0A) 181 bodyChars.set('\r'); // Carriage return (0x0D) 182 // Allow extended ASCII and Unicode range (128-255) 183 for (int i = 128; i <= 255; i++) { 184 bodyChars.set(i); 185 } 186 // Note: Null bytes and other control chars (1-31) are excluded by default 187 // They can be allowed via configuration if needed 188 HTTP_BODY_CHARS = bodyChars; 189 } 190 191 /** 192 * Returns the appropriate character set for the specified validation type. 193 * 194 * <p>This method provides a centralized mapping from validation types to their 195 * corresponding RFC-compliant character sets. The returned BitSet is the actual 196 * instance (not a copy) for performance reasons and must not be modified.</p> 197 * 198 * <h4>Validation Type Mappings:</h4> 199 * <ul> 200 * <li>{@code URL_PATH} → {@link #RFC3986_PATH_CHARS}</li> 201 * <li>{@code PARAMETER_NAME, PARAMETER_VALUE} → {@link #RFC3986_QUERY_CHARS}</li> 202 * <li>{@code HEADER_NAME, HEADER_VALUE} → {@link #RFC7230_HEADER_CHARS}</li> 203 * <li>{@code BODY} → {@link #HTTP_BODY_CHARS}</li> 204 * <li>{@code COOKIE_NAME, COOKIE_VALUE} → {@link #RFC3986_UNRESERVED}</li> 205 * </ul> 206 * 207 * @param type The validation type specifying which HTTP component is being validated 208 * @return The corresponding BitSet containing allowed characters for the validation type 209 * @throws NullPointerException if {@code type} is null 210 * @see ValidationType 211 * @see #RFC3986_PATH_CHARS 212 * @see #RFC3986_QUERY_CHARS 213 * @see #RFC7230_HEADER_CHARS 214 * @see #HTTP_BODY_CHARS 215 * @see #RFC3986_UNRESERVED 216 */ 217 public static BitSet getCharacterSet(ValidationType type) { 218 return switch (type) { 219 case URL_PATH -> RFC3986_PATH_CHARS; 220 case PARAMETER_NAME, PARAMETER_VALUE -> RFC3986_QUERY_CHARS; 221 case HEADER_NAME, HEADER_VALUE -> RFC7230_HEADER_CHARS; 222 case BODY -> HTTP_BODY_CHARS; 223 case COOKIE_NAME, COOKIE_VALUE -> RFC3986_UNRESERVED; 224 }; 225 } 226}