1 | ----------------------------------------------------------------------------- |
---|---|
2 | -- |
3 | -- Copyright 2004 Björn Persson. |
4 | -- |
5 | -- This library is free software; you can redistribute it and/or modify it |
6 | -- under the terms of the GNU General Public License, version 2, as published |
7 | -- by the Free Software Foundation. |
8 | -- |
9 | -- As a special exception, if other files instantiate generics from this |
10 | -- unit, or you link this unit with other files to produce an executable, |
11 | -- this unit does not by itself cause the resulting executable to be covered |
12 | -- by the General Public License. This exception does not however invalidate |
13 | -- any other reasons why the executable file might be covered by the General |
14 | -- Public License. |
15 | -- |
16 | -- This package was based on Gnat's implementation of Ada.Strings.Unbounded, |
17 | -- whose specification was in turn derived from the Ada Reference Manual. The |
18 | -- copyright of Gnat belongs to the Free Software Foundation. |
19 | -- |
20 | ---------------------------------------------------------------------------- |
21 | |
22 | pragma License (Modified_Gpl); |
23 | pragma Ada_2022; |
24 | |
25 | with Ada.Strings.Unbounded; |
26 | with Ada.Finalization; |
27 | with Ada.Strings.Maps; |
28 | with Interfaces; |
29 | with System; |
30 | with AdaCL.OS.Low_Level; |
31 | |
32 | package AdaCL.EAstrings is |
33 | -- |
34 | -- EAstring - encoding-aware string - is a dynamic string type similar to |
35 | -- Ada.Strings.Unbounded.Unbounded_String, but it handles strings in all |
36 | -- character encodings and frees you from character encoding troubles. It |
37 | -- keeps track of how each string is encoded and transcodes them |
38 | -- automatically when necessary. |
39 | -- |
40 | -- The child packages Latin_1 and UCS_2 contain subprograms for using |
41 | -- EAstrings together with Ada's standard string types. Latin_1 is for |
42 | -- String and Unbounded_String, and UCS_2 is for Wide_String and |
43 | -- Unbounded_Wide_String. |
44 | -- |
45 | -- The child package IO provides encoding-aware input and output. |
46 | -- |
47 | -- The child package OS and its descendants provide encoding-aware access |
48 | -- to information from the environment. |
49 | -- |
50 | -- The procedures and functions that manipulate EAstrings will transcode |
51 | -- them as needed. The procedures may change the encoding of their |
52 | -- parameters when these are declared "in out". The functions will leave |
53 | -- their parameters unchanged but may return EAstrings in other encodings. |
54 | -- There are also subprograms available that allow for explicit |
55 | -- transcoding (Transcode and Convert). In most cases it should not be |
56 | -- necessary to use them, but if you need to convert an EAstring to a byte |
57 | -- sequence in a particular encoding you should call Transcode to make |
58 | -- sure that it is in the right encoding, and then retrieve the byte |
59 | -- sequence with Bytes. You may also want to call Transcode for |
60 | -- performance reasons in certain cases. A series of function calls may |
61 | -- lead to the same string being transcoded over and over, and this can be |
62 | -- prevented by explicitly transcoding it first. |
63 | -- |
64 | -- The first character in an EAstring always has the index 1. |
65 | -- |
66 | -- In this package, a character is a Unicode code point; not a byte, a |
67 | -- code unit or a grapheme. In some encodings, a code unit is one byte. In |
68 | -- others it's two or four bytes. In some encodings, one character (code |
69 | -- point) can be encoded in several code units. A grapheme can be |
70 | -- represented as one character or as a base character followed by one or |
71 | -- more combining characters. Most of the subprograms in this package |
72 | -- count, compare and process code points. A few deal with bytes. |
73 | -- |
74 | -- Througout the EAstrings library, it is assumed that a byte is eight |
75 | -- bits. |
76 | -- |
77 | |
78 | ------------------------- |
79 | -- types and constants -- |
80 | ------------------------- |
81 | |
82 | use Ada.Strings; |
83 | |
84 | type EAstring is private; |
85 | |
86 | type EAstring_Array is array (Integer range <>) of EAstring; |
87 | |
88 | type Byte_Sequence is |
89 | array (Positive range <>) of aliased Interfaces.Unsigned_8; |
90 | for Byte_Sequence'Component_Size use 8; |
91 | |
92 | type Converter is private; |
93 | |
94 | type Conversion_Stop_Cause is ( |
95 | All_Done, |
96 | Target_Full, |
97 | Incomplete, |
98 | Inconvertible); |
99 | -- Reason why Convert returns. |
100 | -- All_Done : The whole input string was converted. |
101 | -- Target_Full : There isn't enough room in the target. |
102 | -- Incomplete : The source ends in an incomplete code sequence. |
103 | -- Inconvertible : One of these errors happened: |
104 | -- 1: The source isn't a valid encoded string for the |
105 | -- source encoding. |
106 | -- 2: The source contains a character that can't be |
107 | -- represented in the target encoding. |
108 | |
109 | Null_EAstring : constant EAstring; |
110 | |
111 | Null_Converter : constant Converter; |
112 | |
113 | ---------------------------------------------------------- |
114 | -- Conversion, Concatenation, and Selection Subprograms -- |
115 | ---------------------------------------------------------- |
116 | |
117 | function Length (Source : EAstring) return Natural; |
118 | -- Returns the number of characters in the string. (This is not the same |
119 | -- as the space it will take when printed, even in a fixed-width font.) |
120 | -- This may be a more expensive operation than one might think, because if |
121 | -- the string is in a variable-width encoding, a copy of it must be |
122 | -- converted to a fixed-width encoding before the characters can be |
123 | -- counted. |
124 | |
125 | function To_EAstring (Length : in Natural) return EAstring; |
126 | -- Returns a string of Length spaces. |
127 | |
128 | procedure Append (Source : in out EAstring; New_Item : in EAstring); |
129 | -- Appends New_Item to the end of Source. |
130 | |
131 | function "&" (Left, Right : in EAstring) return EAstring; |
132 | -- Concatenates two EAstrings. |
133 | |
134 | function Slice |
135 | (Source : in EAstring; |
136 | Low : in Positive; |
137 | High : in Natural) |
138 | return EAstring; |
139 | -- Slice returns the characters Low through High in Source. It returns |
140 | -- Null_EAstring if Low > High, and raises Index_Error if Low <= High and |
141 | -- High > Length(Source). |
142 | |
143 | overriding function "=" (Left, Right : in EAstring) return Boolean; |
144 | -- Returns True if and only if Left and Right contain identical sequences |
145 | -- of characters (regardless of their current encodings). |
146 | |
147 | -- function "<"(Left, Right : in EAstring) return Boolean; |
148 | -- |
149 | -- function "<" |
150 | -- (Left : in EAstring; |
151 | -- Right : in String) |
152 | -- return Boolean; |
153 | -- |
154 | -- function "<" |
155 | -- (Left : in String; |
156 | -- Right : in EAstring) |
157 | -- return Boolean; |
158 | -- |
159 | -- function "<="(Left, Right : in EAstring) return Boolean; |
160 | -- |
161 | -- function "<=" |
162 | -- (Left : in EAstring; |
163 | -- Right : in String) |
164 | -- return Boolean; |
165 | -- |
166 | -- function "<=" |
167 | -- (Left : in String; |
168 | -- Right : in EAstring) |
169 | -- return Boolean; |
170 | -- |
171 | -- function ">"(Left, Right : in EAstring) return Boolean; |
172 | -- |
173 | -- function ">" |
174 | -- (Left : in EAstring; |
175 | -- Right : in String) |
176 | -- return Boolean; |
177 | -- |
178 | -- function ">" |
179 | -- (Left : in String; |
180 | -- Right : in EAstring) |
181 | -- return Boolean; |
182 | -- |
183 | -- function ">="(Left, Right : in EAstring) return Boolean; |
184 | -- |
185 | -- function ">=" |
186 | -- (Left : in EAstring; |
187 | -- Right : in String) |
188 | -- return Boolean; |
189 | -- |
190 | -- function ">=" |
191 | -- (Left : in String; |
192 | -- Right : in EAstring) |
193 | -- return Boolean; |
194 | |
195 | ------------------------ |
196 | -- Search Subprograms -- |
197 | ------------------------ |
198 | |
199 | function Index |
200 | (Source : in EAstring; |
201 | Pattern : in EAstring; |
202 | Going : in Direction := Forward) |
203 | -- Mapping : in Maps.Character_Mapping := Maps.Identity) to be added |
204 | -- later |
205 | return Natural; |
206 | -- Searches for a slice of Source, with the same length as Pattern, that |
207 | -- matches Pattern with respect to Mapping. The parameter Going indicates |
208 | -- the direction of the lookup. Index returns the smallest (if Going = |
209 | -- Forward) or largest (if Going = Backward) index I such that the slice |
210 | -- of Source starting at I matches Pattern. If there is no such slice, |
211 | -- then 0 is returned. If Pattern is the null string then Pattern_Error is |
212 | -- propagated. |
213 | |
214 | -- function Index |
215 | -- (Source : in EAstring; |
216 | -- Pattern : in String; |
217 | -- Going : in Direction := Forward; |
218 | -- Mapping : in Maps.Character_Mapping_Function) |
219 | -- return Natural; |
220 | -- |
221 | -- function Index |
222 | -- (Source : in EAstring; |
223 | -- Set : in Maps.Character_Set; |
224 | -- Test : in Membership := Inside; |
225 | -- Going : in Direction := Forward) |
226 | -- return Natural; |
227 | |
228 | function Index_Non_Blank |
229 | (Source : in EAstring; |
230 | Going : in Direction := Forward) |
231 | return Natural; |
232 | -- Returns the index of the first (if Going = Forward) or last (if Going = |
233 | -- Backward) character in Source that is not a space (U+0020). If there is |
234 | -- no such character, then 0 is returned. |
235 | |
236 | function Count |
237 | (Source : in EAstring; |
238 | Pattern : in EAstring) |
239 | -- Mapping : in Maps.Character_Mapping := Maps.Identity) to be added |
240 | -- later |
241 | return Natural; |
242 | -- Returns the maximum number of nonoverlapping slices of Source that |
243 | -- match Pattern with respect to Mapping. If Pattern is the null string |
244 | -- then Pattern_Error is propagated. |
245 | |
246 | -- function Count |
247 | -- (Source : in EAstring; |
248 | -- Pattern : in String; |
249 | -- Mapping : in Maps.Character_Mapping_Function) |
250 | -- return Natural; |
251 | -- |
252 | -- function Count |
253 | -- (Source : in EAstring; |
254 | -- Set : in Maps.Character_Set) |
255 | -- return Natural; |
256 | -- |
257 | -- procedure Find_Token |
258 | -- (Source : in EAstring; |
259 | -- Set : in Maps.Character_Set; |
260 | -- Test : in Membership; |
261 | -- First : out Positive; |
262 | -- Last : out Natural); |
263 | -- |
264 | -- ------------------------------------ |
265 | -- -- String Translation Subprograms -- |
266 | -- ------------------------------------ |
267 | -- |
268 | -- function Translate |
269 | -- (Source : in EAstring; |
270 | -- Mapping : in Maps.Character_Mapping) |
271 | -- return EAstring; |
272 | -- |
273 | -- procedure Translate |
274 | -- (Source : in out EAstring; |
275 | -- Mapping : Maps.Character_Mapping); |
276 | -- |
277 | -- function Translate |
278 | -- (Source : in EAstring; |
279 | -- Mapping : in Maps.Character_Mapping_Function) |
280 | -- return EAstring; |
281 | -- |
282 | -- procedure Translate |
283 | -- (Source : in out EAstring; |
284 | -- Mapping : in Maps.Character_Mapping_Function); |
285 | |
286 | --------------------------------------- |
287 | -- String Transformation Subprograms -- |
288 | --------------------------------------- |
289 | |
290 | function Replace_Slice |
291 | (Source : in EAstring; |
292 | Low : in Positive; |
293 | High : in Natural; |
294 | By : in EAstring) |
295 | return EAstring; |
296 | -- If Low > Length(Source)+1, then Index_Error is propagated. Otherwise: � |
297 | -- If High >= Low, then the returned string comprises |
298 | -- Slice(Source, 1, Low-1) & By & Slice(Source, High+1, Length(Source)). |
299 | -- � If High < Low, then the returned string is Insert(Source, Low, By). |
300 | |
301 | procedure Replace_Slice |
302 | (Source : in out EAstring; |
303 | Low : in Positive; |
304 | High : in Natural; |
305 | By : in EAstring); |
306 | -- Like above, but changes Source instead of returning a new string. |
307 | |
308 | function Insert |
309 | (Source : in EAstring; |
310 | Before : in Positive; |
311 | New_Item : in EAstring) |
312 | return EAstring; |
313 | -- Propagates Index_Error if Before > Length(Source)+1; otherwise returns |
314 | -- Slice(Source, 1, Before-1) & New_Item & Slice(Source, Before, |
315 | -- Length(Source)). |
316 | |
317 | procedure Insert |
318 | (Source : in out EAstring; |
319 | Before : in Positive; |
320 | New_Item : in EAstring); |
321 | -- Like above, but changes Source instead of returning a new string. |
322 | |
323 | function Overwrite |
324 | (Source : in EAstring; |
325 | Position : in Positive; |
326 | New_Item : in EAstring) |
327 | return EAstring; |
328 | -- Propagates Index_Error if Position > Length(Source)+1; otherwise |
329 | -- returns the string obtained from Source by consecutively replacing |
330 | -- characters starting at Position with corresponding characters from |
331 | -- New_Item. If the end of Source is reached before the characters in |
332 | -- New_Item are exhausted, the remaining characters from New_Item are |
333 | -- appended to the string. |
334 | |
335 | procedure Overwrite |
336 | (Source : in out EAstring; |
337 | Position : in Positive; |
338 | New_Item : in EAstring); |
339 | -- Like above, but changes Source instead of returning a new string. |
340 | |
341 | function Delete |
342 | (Source : in EAstring; |
343 | From : in Positive; |
344 | Through : in Natural) |
345 | return EAstring; |
346 | -- Returns Replace_Slice(Source, From, Through, Null_EAstring). |
347 | |
348 | procedure Delete |
349 | (Source : in out EAstring; |
350 | From : in Positive; |
351 | Through : in Natural); |
352 | -- Like above, but changes Source instead of returning a new string. |
353 | |
354 | function Trim |
355 | (Source : in EAstring; |
356 | Side : in Trim_End) |
357 | return EAstring; |
358 | -- Returns the EAstring obtained by removing from Source all leading (if |
359 | -- Side = Left), trailing (if Side = Right), or leading and trailing (if |
360 | -- Side = Both) space characters (U+0020). |
361 | |
362 | procedure Trim (Source : in out EAstring; Side : in Trim_End); |
363 | -- Like above, but changes Source instead of returning a new string. |
364 | |
365 | -- function Trim |
366 | -- (Source : in EAstring; |
367 | -- Left : in Maps.Character_Set; |
368 | -- Right : in Maps.Character_Set) |
369 | -- return EAstring; |
370 | -- |
371 | -- procedure Trim |
372 | -- (Source : in out EAstring; |
373 | -- Left : in Maps.Character_Set; |
374 | -- Right : in Maps.Character_Set); |
375 | |
376 | function Head |
377 | (Source : in EAstring; |
378 | Count : in Natural) |
379 | -- : in Unicode_Character := Space" to be added later. |
380 | return EAstring; |
381 | -- Returns an EAstring of length Count. If Count <= Length(Source), the |
382 | -- string comprises the first Count characters of Source. Otherwise its |
383 | -- contents are Source concatenated with Count - Length(Source) spaces. |
384 | |
385 | procedure Head (Source : in out EAstring; Count : in Natural); |
386 | -- "Pad : in Unicode_Character := Space" to be added later. |
387 | -- Like above, but changes Source instead of returning a new string. |
388 | |
389 | function Tail |
390 | (Source : in EAstring; |
391 | Count : in Natural) |
392 | -- : in Unicode_Character := Space" to be added later. |
393 | return EAstring; |
394 | -- Returns an EAstring of length Count. If Count <= Length(Source), the |
395 | -- string comprises the last Count characters of Source. Otherwise its |
396 | -- contents are Count - Length(Source) spaces concatenated with Source. |
397 | |
398 | procedure Tail (Source : in out EAstring; Count : in Natural); |
399 | -- "Pad : in Unicode_Character := Space" to be added later. |
400 | -- Like above, but changes Source instead of returning a new string. |
401 | |
402 | function "*" (Left : in Natural; Right : in EAstring) return EAstring; |
403 | -- Returns an EAstring that comprises Left concatenated copies of Right. |
404 | |
405 | --------------------------------------------------------------------------- |
406 | -- package Character_Encodings |
407 | --------------------------------------------------------------------------- |
408 | |
409 | package Character_Encodings is |
410 | |
411 | use Ada.Strings.Unbounded; |
412 | |
413 | type Character_Encoding (Known : Boolean := False) is private; |
414 | |
415 | subtype Known_Character_Encoding is Character_Encoding (True); |
416 | |
417 | -- Note: If Gnat crashes on a declaration of a Character_Encoding |
418 | -- variable, try initializing the variable at the declaration. (This |
419 | -- behaviour was observed with GCC-Gnat 3.4.0.) |
420 | |
421 | subtype Character_Width is Positive range 1 .. 4; |
422 | |
423 | UCS_4_BE, UCS_4_LE, UCS_4, UCS_2_BE, UCS_2_LE, UCS_2, UTF_16_BE, |
424 | UTF_16_LE, UTF_16, UTF_8, US_ASCII, Latin_1 : constant |
425 | Known_Character_Encoding; |
426 | -- These are the encodings that this package knows some characteristics |
427 | -- of. They are therefore handled more efficiently than other encodings |
428 | -- in some situations. "BE" and "LE" means big-endian and |
429 | -- little-endian. UCS_4, UCS_2 and UTF_16 will have the machine's |
430 | -- native byte order. |
431 | |
432 | overriding function "=" (Left, Right : Character_Encoding) return Boolean; |
433 | -- Redefining "=" shouldn't be necessary, but it works around a bug in |
434 | -- GCC-Gnat 3.4.0. |
435 | |
436 | function Name |
437 | (Encoding : Character_Encoding) |
438 | return Unbounded_String; |
439 | |
440 | function Encoding_By_Name (Name : String) return Character_Encoding; |
441 | |
442 | function Fixed_Width (Encoding : Character_Encoding) return Boolean; |
443 | -- Returns true if Encoding is a Known_Character_Encoding and uses the |
444 | -- same number of bytes for all characters, false otherwise. |
445 | |
446 | function Width |
447 | (Encoding : Known_Character_Encoding) |
448 | return Character_Width; |
449 | -- Returns the number of bytes used for one character in this encoding. |
450 | -- Propagates Constraint_Error if the character width is variable. |
451 | |
452 | function ASCII_Compatible |
453 | (Encoding : Character_Encoding) |
454 | return Boolean; |
455 | -- Returns true if and only if Encoding is a Known_Character_Encoding |
456 | -- and is compatible with ASCII so that all the characters in ASCII are |
457 | -- encoded with the same numbers in this encoding as in ASCII, and any |
458 | -- valid ASCII text is also a valid text in this encoding. |
459 | |
460 | function Subset_Of_UCS_2 |
461 | (Encoding : Character_Encoding) |
462 | return Boolean; |
463 | -- Returns true if and only if Encoding is a Known_Character_Encoding |
464 | -- and all the characters that can be represented in this encoding can |
465 | -- also be represented in UCS-2 (but mostly with different numbers). |
466 | -- (Note that UCS-2 itself fits the criteria.) |
467 | |
468 | ------------------------------------------------------------------------ |
469 | ----- |
470 | private |
471 | ------------------------------------------------------------------------ |
472 | ----- |
473 | |
474 | use AdaCL.OS.Low_Level; |
475 | use Interfaces; |
476 | use Ada.Strings.Maps; |
477 | |
478 | type Encoding_Data is record |
479 | Name : Unbounded_String; |
480 | Windows_Number : Unsigned_16; |
481 | Width : Natural range 0 .. 4; -- 0 for variable |
482 | ASCII_Compatible : Boolean; |
483 | Subset_Of_UCS_2 : Boolean; |
484 | end record; |
485 | |
486 | type Encoding_ID is ( |
487 | UCS_4_BE_ID, |
488 | UCS_4_LE_ID, |
489 | UCS_2_BE_ID, |
490 | UCS_2_LE_ID, |
491 | UTF_16_BE_ID, |
492 | UTF_16_LE_ID, |
493 | UTF_8_ID, |
494 | ASCII_ID, |
495 | Latin_1_ID, |
496 | CP_850_ID, |
497 | Windows_1252_ID); |
498 | |
499 | Encoding_Info : constant array (Encoding_ID) of Encoding_Data := |
500 | [(To_Unbounded_String ("UCS-4BE"), 12001, 4, False, False), |
501 | (To_Unbounded_String ("UCS-4LE"), 12000, 4, False, False), |
502 | (To_Unbounded_String ("UCS-2BE"), 1201, 2, False, True), |
503 | (To_Unbounded_String ("UCS-2LE"), 1200, 2, False, True), |
504 | (To_Unbounded_String ("UTF-16BE"), 0, 0, False, False), |
505 | (To_Unbounded_String ("UTF-16LE"), 0, 0, False, False), |
506 | (To_Unbounded_String ("UTF-8"), 65001, 0, True, False), |
507 | (To_Unbounded_String ("US-ASCII"), 20127, 1, True, True), |
508 | (To_Unbounded_String ("ISO-8859-1"), 28591, 1, True, True), |
509 | (To_Unbounded_String ("IBM850"), 850, 1, True, True), |
510 | (To_Unbounded_String ("WINDOWS-1252"), 1252, 1, True, True)]; |
511 | |
512 | type Encoding_Alias is record |
513 | Name : Unbounded_String; |
514 | ID : Encoding_ID; |
515 | end record; |
516 | |
517 | Encoding_Aliases : constant |
518 | array (Positive range <>) of Encoding_Alias := |
519 | [(To_Unbounded_String ("ANSI-X3.4-1968"), ASCII_ID), |
520 | (To_Unbounded_String ("ASCII"), ASCII_ID), |
521 | (To_Unbounded_String ("CP-850"), CP_850_ID), |
522 | (To_Unbounded_String ("CP850"), CP_850_ID), |
523 | (To_Unbounded_String ("IBM850"), CP_850_ID), |
524 | (To_Unbounded_String ("ISO-8859-1"), Latin_1_ID), |
525 | (To_Unbounded_String ("LATIN-1"), Latin_1_ID), |
526 | (To_Unbounded_String ("LATIN1"), Latin_1_ID), |
527 | (To_Unbounded_String ("UCS-2BE"), UCS_2_BE_ID), |
528 | (To_Unbounded_String ("UCS-2LE"), UCS_2_LE_ID), |
529 | (To_Unbounded_String ("UCS-4BE"), UCS_4_BE_ID), |
530 | (To_Unbounded_String ("UCS-4LE"), UCS_4_LE_ID), |
531 | (To_Unbounded_String ("US-ASCII"), ASCII_ID), |
532 | (To_Unbounded_String ("UTF-16BE"), UTF_16_BE_ID), |
533 | (To_Unbounded_String ("UTF-16LE"), UTF_16_LE_ID), |
534 | (To_Unbounded_String ("UTF-32BE"), UCS_4_BE_ID), |
535 | (To_Unbounded_String ("UTF-32LE"), UCS_4_LE_ID), |
536 | (To_Unbounded_String ("UTF-8"), UTF_8_ID), |
537 | (To_Unbounded_String ("WINDOWS-1252"), Windows_1252_ID)]; |
538 | |
539 | Normalization_Map : constant Character_Mapping := |
540 | To_Mapping |
541 | (From => "abcdefghijklmnopqrstuvwxyz_", |
542 | To => "ABCDEFGHIJKLMNOPQRSTUVWXYZ-"); |
543 | |
544 | type Unified_Encoding_Record (Known : Boolean; OS : Known_OS) is record |
545 | case Known is |
546 | when True => |
547 | Which : Encoding_ID; |
548 | when False => |
549 | case OS is |
550 | when Linux | MacOS => |
551 | Name : Unbounded_String; |
552 | -- Names of encodings are confined to ASCII. |
553 | when Windows => |
554 | Number : Unsigned_16; |
555 | end case; |
556 | end case; |
557 | end record; |
558 | |
559 | type Character_Encoding (Known : Boolean := False) is new |
560 | Unified_Encoding_Record (Known => Known, OS => This_OS); |
561 | |
562 | UCS_4_BE : constant Known_Character_Encoding := (True, UCS_4_BE_ID); |
563 | UCS_4_LE : constant Known_Character_Encoding := (True, UCS_4_LE_ID); |
564 | UCS_2_BE : constant Known_Character_Encoding := (True, UCS_2_BE_ID); |
565 | UCS_2_LE : constant Known_Character_Encoding := (True, UCS_2_LE_ID); |
566 | UTF_16_BE : constant Known_Character_Encoding := (True, UTF_16_BE_ID); |
567 | UTF_16_LE : constant Known_Character_Encoding := (True, UTF_16_LE_ID); |
568 | UTF_8 : constant Known_Character_Encoding := (True, UTF_8_ID); |
569 | US_ASCII : constant Known_Character_Encoding := (True, ASCII_ID); |
570 | Latin_1 : constant Known_Character_Encoding := (True, Latin_1_ID); |
571 | |
572 | -- Define UCS_4, UCS_2 and UTF_16 with the macine's native byte order. |
573 | |
574 | Endianness_Shift : constant array (System.Bit_Order) of Natural := |
575 | [System.High_Order_First => 0, |
576 | System.Low_Order_First => 1]; |
577 | |
578 | UCS_4 : constant Known_Character_Encoding := |
579 | (True, |
580 | Encoding_ID'Val |
581 | (Encoding_ID'Pos (UCS_4_BE_ID) + |
582 | Endianness_Shift (System.Default_Bit_Order))); |
583 | UCS_2 : constant Known_Character_Encoding := |
584 | (True, |
585 | Encoding_ID'Val |
586 | (Encoding_ID'Pos (UCS_2_BE_ID) + |
587 | Endianness_Shift (System.Default_Bit_Order))); |
588 | UTF_16 : constant Known_Character_Encoding := |
589 | (True, |
590 | Encoding_ID'Val |
591 | (Encoding_ID'Pos (UTF_16_BE_ID) + |
592 | Endianness_Shift (System.Default_Bit_Order))); |
593 | |
594 | end Character_Encodings; |
595 | |
596 | --------------------------------------------------------------------------- |
597 | |
598 | ------------------------- |
599 | -- Encoding management -- |
600 | ------------------------- |
601 | |
602 | use Character_Encodings; |
603 | |
604 | function Transcode |
605 | (Source : in EAstring; |
606 | New_Encoding : in Character_Encoding) |
607 | return EAstring; |
608 | -- Returns a copy of Source encoded in New_Encoding. |
609 | |
610 | procedure Transcode |
611 | (Source : in out EAstring; |
612 | New_Encoding : in Character_Encoding); |
613 | -- Makes sure that Source is encoded in New_Encoding. |
614 | |
615 | function Transcode |
616 | (Source : in EAstring_Array; |
617 | New_Encoding : in Character_Encoding) |
618 | return EAstring_Array; |
619 | -- Returns a copy of Source where each string is encoded in New_Encoding. |
620 | |
621 | procedure Transcode |
622 | (Source : in out EAstring_Array; |
623 | New_Encoding : in Character_Encoding); |
624 | -- Makes sure that each string in Source is encoded in New_Encoding. |
625 | |
626 | function Open_Converter |
627 | (From, To : Character_Encoding) |
628 | return Converter; |
629 | -- Returns a freshly allocated converter object that converts text from |
630 | -- the encoding given in From to the one in To. The converter must be |
631 | -- freed with Close_Converter. Unsupported_Conversion is raised if the |
632 | -- operating system doesn't support this conversion. |
633 | |
634 | procedure Convert |
635 | (State : in Converter; |
636 | Source : in Byte_Sequence; |
637 | Target : out Byte_Sequence; |
638 | Source_Last : out Natural; |
639 | Target_Last : out Natural; |
640 | Cause : out Conversion_Stop_Cause); |
641 | -- State must be a converter object allocated by Open_Converter. Source |
642 | -- must be a string encoded in the encoding that this converter converts |
643 | -- from. The string is converted and stored in Target. The indices in |
644 | -- Source and in Target of the last byte of the last character that was |
645 | -- converted are stored in Source_Last and Target_Last, respectively. The |
646 | -- reason why Convert stopped is reported in Cause. |
647 | |
648 | procedure Reset_Converter (Item : in Converter); |
649 | -- Resets a converter object to its initial state. |
650 | |
651 | procedure Close_Converter (Item : in out Converter); |
652 | -- Frees a converter object allocated by Open_Converter. |
653 | |
654 | function To_EAstring |
655 | (Source : in Byte_Sequence; |
656 | Encoding : in Character_Encoding) |
657 | return EAstring; |
658 | -- Constructs an EAstring from a byte sequence and a character encoding. |
659 | -- Source had better be a valid encoded text in Encoding. |
660 | |
661 | function Encoding (Source : in EAstring) return Character_Encoding; |
662 | -- Returns the encoding that an EAstring is currently encoded in. |
663 | |
664 | function Bytes (Source : in EAstring) return Byte_Sequence; |
665 | -- Returns the byte sequence that an EAstring is encoded as in its current |
666 | -- encoding. |
667 | |
668 | function Size (Source : EAstring) return Natural; |
669 | -- Returns the number of bytes in the byte sequence that the string is |
670 | -- currently encoded as. |
671 | |
672 | function Byte_Sequence_To_Fake_String |
673 | (Source : in Byte_Sequence) |
674 | return String; |
675 | -- Unchecked conversion of Byte_Sequence to String. The result will not be |
676 | -- a real String (unless the byte sequence happens to be in Latin 1). For |
677 | -- passing strings to functions that take String parameters but interpret |
678 | -- them as some other encoding. |
679 | |
680 | function Fake_String_To_Byte_Sequence |
681 | (Source : in String) |
682 | return Byte_Sequence; |
683 | -- Unchecked conversion of String to Byte_Sequence. For making EAstrings |
684 | -- from objects that are declared as String but contain some other |
685 | -- encoding than Latin 1. |
686 | |
687 | ---------------- |
688 | -- Exceptions -- |
689 | ---------------- |
690 | |
691 | Unsupported_Conversion : exception; |
692 | -- An attempt was made to do an encoding conversion that the operating |
693 | -- system doesn't support. Probably the target encoding name wasn't |
694 | -- recognized. |
695 | |
696 | Incomplete_Byte_Sequence : exception; |
697 | -- The string ends in an incomplete code sequence. Maybe it has been cut |
698 | -- off in the wrong place. |
699 | |
700 | Conversion_Impossible : exception; |
701 | -- One of these errors happened during a conversion: 1: The byte sequence |
702 | -- isn't a valid encoded string for its associated |
703 | -- encoding. |
704 | -- 2: The string contains a character that can't be represented in the |
705 | -- target encoding. |
706 | |
707 | Other_Error : exception; |
708 | -- An undocumented error happened in a C function. |
709 | |
710 | --------------------------------------------------------------------------- |
711 | ----- |
712 | private |
713 | --------------------------------------------------------------------------- |
714 | ----- |
715 | |
716 | package AF renames Ada.Finalization; |
717 | |
718 | type Converter is new System.Address; |
719 | |
720 | Null_Sequence : aliased Byte_Sequence := [1 .. 0 => 0]; |
721 | |
722 | Null_String_Encoding : constant Character_Encoding := US_ASCII; |
723 | -- The null string is set to the least demanding encoding to avoid |
724 | -- unnecessarily transcoding other strings. |
725 | |
726 | type Byte_Sequence_Access is access all Byte_Sequence; |
727 | |
728 | type EAstring is new AF.Controlled with record |
729 | Encoding : Character_Encoding := Null_String_Encoding; |
730 | Reference : Byte_Sequence_Access := Null_Sequence'Access; |
731 | Last : Natural := 0; |
732 | end record; |
733 | |
734 | -- EAstrings uses a buffered implementation to increase the speed of the |
735 | -- string manipulation procedures. The Byte_Sequence that Reference points |
736 | -- to contains the current encoded string value and extra room at the end |
737 | -- to be used by the next Append routine. Last is the index of the last |
738 | -- byte of the string. So the current byte sequence is really Reference(1 |
739 | -- .. Last). |
740 | |
741 | -- pragma Stream_Convert(EAstring, To_EA, To_String); |
742 | |
743 | pragma Finalize_Storage_Only (EAstring); |
744 | |
745 | overriding procedure Initialize (Object : in out EAstring); |
746 | overriding procedure Adjust (Object : in out EAstring); |
747 | overriding procedure Finalize (Object : in out EAstring); |
748 | |
749 | procedure Realloc_For_Chunk |
750 | (Source : in out EAstring; |
751 | Chunk_Size : Natural); |
752 | pragma Inline (Realloc_For_Chunk); |
753 | -- Adjust the size allocated for the string. Add at least Chunk_Size so it |
754 | -- is safe to add a byte sequence of this size at the end of the current |
755 | -- content. The real size allocated for the byte sequence is Chunk_Size + |
756 | -- x % of the current byte sequence size. This buffered handling makes the |
757 | -- Append routines very fast. |
758 | |
759 | Null_EAstring : constant EAstring := |
760 | (AF.Controlled with |
761 | Encoding => Null_String_Encoding, |
762 | Reference => Null_Sequence'Access, |
763 | Last => 0); |
764 | |
765 | Null_Converter : constant Converter := Converter (System.Null_Address); |
766 | |
767 | end AdaCL.EAstrings; |