adacl-eastrings.ads

1-----------------------------------------------------------------------------
2--
3-- Copyright 2004 Björn Persson.
4--
5-- This library is free software; you can redistribute it and/or modify it
6-- under the terms of the GNU General Public License, version 2, as published
7-- by the Free Software Foundation.
8--
9-- As a special exception, if other files instantiate generics from this
10-- unit, or you link this unit with other files to produce an executable,
11-- this unit does not by itself cause the resulting executable to be covered
12-- by the General Public License. This exception does not however invalidate
13-- any other reasons why the executable file might be covered by the General
14-- Public License.
15--
16-- This package was based on Gnat's implementation of Ada.Strings.Unbounded,
17-- whose specification was in turn derived from the Ada Reference Manual. The
18-- copyright of Gnat belongs to the Free Software Foundation.
19--
20----------------------------------------------------------------------------
21
22pragma License (Modified_Gpl);
23pragma Ada_2022;
24
25with Ada.Strings.Unbounded;
26with Ada.Finalization;
27with Ada.Strings.Maps;
28with Interfaces;
29with System;
30with AdaCL.OS.Low_Level;
31
32package AdaCL.EAstrings is
33 --
34 -- EAstring - encoding-aware string - is a dynamic string type similar to
35 -- Ada.Strings.Unbounded.Unbounded_String, but it handles strings in all
36 -- character encodings and frees you from character encoding troubles. It
37 -- keeps track of how each string is encoded and transcodes them
38 -- automatically when necessary.
39 --
40 -- The child packages Latin_1 and UCS_2 contain subprograms for using
41 -- EAstrings together with Ada's standard string types. Latin_1 is for
42 -- String and Unbounded_String, and UCS_2 is for Wide_String and
43 -- Unbounded_Wide_String.
44 --
45 -- The child package IO provides encoding-aware input and output.
46 --
47 -- The child package OS and its descendants provide encoding-aware access
48 -- to information from the environment.
49 --
50 -- The procedures and functions that manipulate EAstrings will transcode
51 -- them as needed. The procedures may change the encoding of their
52 -- parameters when these are declared "in out". The functions will leave
53 -- their parameters unchanged but may return EAstrings in other encodings.
54 -- There are also subprograms available that allow for explicit
55 -- transcoding (Transcode and Convert). In most cases it should not be
56 -- necessary to use them, but if you need to convert an EAstring to a byte
57 -- sequence in a particular encoding you should call Transcode to make
58 -- sure that it is in the right encoding, and then retrieve the byte
59 -- sequence with Bytes. You may also want to call Transcode for
60 -- performance reasons in certain cases. A series of function calls may
61 -- lead to the same string being transcoded over and over, and this can be
62 -- prevented by explicitly transcoding it first.
63 --
64 -- The first character in an EAstring always has the index 1.
65 --
66 -- In this package, a character is a Unicode code point; not a byte, a
67 -- code unit or a grapheme. In some encodings, a code unit is one byte. In
68 -- others it's two or four bytes. In some encodings, one character (code
69 -- point) can be encoded in several code units. A grapheme can be
70 -- represented as one character or as a base character followed by one or
71 -- more combining characters. Most of the subprograms in this package
72 -- count, compare and process code points. A few deal with bytes.
73 --
74 -- Througout the EAstrings library, it is assumed that a byte is eight
75 -- bits.
76 --
77
78 -------------------------
79 -- types and constants --
80 -------------------------
81
82 use Ada.Strings;
83
84 type EAstring is private;
85
86 type EAstring_Array is array (Integer range <>) of EAstring;
87
88 type Byte_Sequence is
89 array (Positive range <>) of aliased Interfaces.Unsigned_8;
90 for Byte_Sequence'Component_Size use 8;
91
92 type Converter is private;
93
94 type Conversion_Stop_Cause is (
95 All_Done,
96 Target_Full,
97 Incomplete,
98 Inconvertible);
99 -- Reason why Convert returns.
100 -- All_Done : The whole input string was converted.
101 -- Target_Full : There isn't enough room in the target.
102 -- Incomplete : The source ends in an incomplete code sequence.
103 -- Inconvertible : One of these errors happened:
104 -- 1: The source isn't a valid encoded string for the
105 -- source encoding.
106 -- 2: The source contains a character that can't be
107 -- represented in the target encoding.
108
109 Null_EAstring : constant EAstring;
110
111 Null_Converter : constant Converter;
112
113 ----------------------------------------------------------
114 -- Conversion, Concatenation, and Selection Subprograms --
115 ----------------------------------------------------------
116
117 function Length (Source : EAstring) return Natural;
118 -- Returns the number of characters in the string. (This is not the same
119 -- as the space it will take when printed, even in a fixed-width font.)
120 -- This may be a more expensive operation than one might think, because if
121 -- the string is in a variable-width encoding, a copy of it must be
122 -- converted to a fixed-width encoding before the characters can be
123 -- counted.
124
125 function To_EAstring (Length : in Natural) return EAstring;
126 -- Returns a string of Length spaces.
127
128 procedure Append (Source : in out EAstring; New_Item : in EAstring);
129 -- Appends New_Item to the end of Source.
130
131 function "&" (Left, Right : in EAstring) return EAstring;
132 -- Concatenates two EAstrings.
133
134 function Slice
135 (Source : in EAstring;
136 Low : in Positive;
137 High : in Natural)
138 return EAstring;
139 -- Slice returns the characters Low through High in Source. It returns
140 -- Null_EAstring if Low > High, and raises Index_Error if Low <= High and
141 -- High > Length(Source).
142
143 overriding function "=" (Left, Right : in EAstring) return Boolean;
144 -- Returns True if and only if Left and Right contain identical sequences
145 -- of characters (regardless of their current encodings).
146
147 -- function "<"(Left, Right : in EAstring) return Boolean;
148 --
149 -- function "<"
150 -- (Left : in EAstring;
151 -- Right : in String)
152 -- return Boolean;
153 --
154 -- function "<"
155 -- (Left : in String;
156 -- Right : in EAstring)
157 -- return Boolean;
158 --
159 -- function "<="(Left, Right : in EAstring) return Boolean;
160 --
161 -- function "<="
162 -- (Left : in EAstring;
163 -- Right : in String)
164 -- return Boolean;
165 --
166 -- function "<="
167 -- (Left : in String;
168 -- Right : in EAstring)
169 -- return Boolean;
170 --
171 -- function ">"(Left, Right : in EAstring) return Boolean;
172 --
173 -- function ">"
174 -- (Left : in EAstring;
175 -- Right : in String)
176 -- return Boolean;
177 --
178 -- function ">"
179 -- (Left : in String;
180 -- Right : in EAstring)
181 -- return Boolean;
182 --
183 -- function ">="(Left, Right : in EAstring) return Boolean;
184 --
185 -- function ">="
186 -- (Left : in EAstring;
187 -- Right : in String)
188 -- return Boolean;
189 --
190 -- function ">="
191 -- (Left : in String;
192 -- Right : in EAstring)
193 -- return Boolean;
194
195 ------------------------
196 -- Search Subprograms --
197 ------------------------
198
199 function Index
200 (Source : in EAstring;
201 Pattern : in EAstring;
202 Going : in Direction := Forward)
203 -- Mapping : in Maps.Character_Mapping := Maps.Identity) to be added
204 -- later
205 return Natural;
206 -- Searches for a slice of Source, with the same length as Pattern, that
207 -- matches Pattern with respect to Mapping. The parameter Going indicates
208 -- the direction of the lookup. Index returns the smallest (if Going =
209 -- Forward) or largest (if Going = Backward) index I such that the slice
210 -- of Source starting at I matches Pattern. If there is no such slice,
211 -- then 0 is returned. If Pattern is the null string then Pattern_Error is
212 -- propagated.
213
214 -- function Index
215 -- (Source : in EAstring;
216 -- Pattern : in String;
217 -- Going : in Direction := Forward;
218 -- Mapping : in Maps.Character_Mapping_Function)
219 -- return Natural;
220 --
221 -- function Index
222 -- (Source : in EAstring;
223 -- Set : in Maps.Character_Set;
224 -- Test : in Membership := Inside;
225 -- Going : in Direction := Forward)
226 -- return Natural;
227
228 function Index_Non_Blank
229 (Source : in EAstring;
230 Going : in Direction := Forward)
231 return Natural;
232 -- Returns the index of the first (if Going = Forward) or last (if Going =
233 -- Backward) character in Source that is not a space (U+0020). If there is
234 -- no such character, then 0 is returned.
235
236 function Count
237 (Source : in EAstring;
238 Pattern : in EAstring)
239 -- Mapping : in Maps.Character_Mapping := Maps.Identity) to be added
240 -- later
241 return Natural;
242 -- Returns the maximum number of nonoverlapping slices of Source that
243 -- match Pattern with respect to Mapping. If Pattern is the null string
244 -- then Pattern_Error is propagated.
245
246 -- function Count
247 -- (Source : in EAstring;
248 -- Pattern : in String;
249 -- Mapping : in Maps.Character_Mapping_Function)
250 -- return Natural;
251 --
252 -- function Count
253 -- (Source : in EAstring;
254 -- Set : in Maps.Character_Set)
255 -- return Natural;
256 --
257 -- procedure Find_Token
258 -- (Source : in EAstring;
259 -- Set : in Maps.Character_Set;
260 -- Test : in Membership;
261 -- First : out Positive;
262 -- Last : out Natural);
263 --
264 -- ------------------------------------
265 -- -- String Translation Subprograms --
266 -- ------------------------------------
267 --
268 -- function Translate
269 -- (Source : in EAstring;
270 -- Mapping : in Maps.Character_Mapping)
271 -- return EAstring;
272 --
273 -- procedure Translate
274 -- (Source : in out EAstring;
275 -- Mapping : Maps.Character_Mapping);
276 --
277 -- function Translate
278 -- (Source : in EAstring;
279 -- Mapping : in Maps.Character_Mapping_Function)
280 -- return EAstring;
281 --
282 -- procedure Translate
283 -- (Source : in out EAstring;
284 -- Mapping : in Maps.Character_Mapping_Function);
285
286 ---------------------------------------
287 -- String Transformation Subprograms --
288 ---------------------------------------
289
290 function Replace_Slice
291 (Source : in EAstring;
292 Low : in Positive;
293 High : in Natural;
294 By : in EAstring)
295 return EAstring;
296 -- If Low > Length(Source)+1, then Index_Error is propagated. Otherwise: �
297 -- If High >= Low, then the returned string comprises
298 -- Slice(Source, 1, Low-1) & By & Slice(Source, High+1, Length(Source)).
299 -- � If High < Low, then the returned string is Insert(Source, Low, By).
300
301 procedure Replace_Slice
302 (Source : in out EAstring;
303 Low : in Positive;
304 High : in Natural;
305 By : in EAstring);
306 -- Like above, but changes Source instead of returning a new string.
307
308 function Insert
309 (Source : in EAstring;
310 Before : in Positive;
311 New_Item : in EAstring)
312 return EAstring;
313 -- Propagates Index_Error if Before > Length(Source)+1; otherwise returns
314 -- Slice(Source, 1, Before-1) & New_Item & Slice(Source, Before,
315 -- Length(Source)).
316
317 procedure Insert
318 (Source : in out EAstring;
319 Before : in Positive;
320 New_Item : in EAstring);
321 -- Like above, but changes Source instead of returning a new string.
322
323 function Overwrite
324 (Source : in EAstring;
325 Position : in Positive;
326 New_Item : in EAstring)
327 return EAstring;
328 -- Propagates Index_Error if Position > Length(Source)+1; otherwise
329 -- returns the string obtained from Source by consecutively replacing
330 -- characters starting at Position with corresponding characters from
331 -- New_Item. If the end of Source is reached before the characters in
332 -- New_Item are exhausted, the remaining characters from New_Item are
333 -- appended to the string.
334
335 procedure Overwrite
336 (Source : in out EAstring;
337 Position : in Positive;
338 New_Item : in EAstring);
339 -- Like above, but changes Source instead of returning a new string.
340
341 function Delete
342 (Source : in EAstring;
343 From : in Positive;
344 Through : in Natural)
345 return EAstring;
346 -- Returns Replace_Slice(Source, From, Through, Null_EAstring).
347
348 procedure Delete
349 (Source : in out EAstring;
350 From : in Positive;
351 Through : in Natural);
352 -- Like above, but changes Source instead of returning a new string.
353
354 function Trim
355 (Source : in EAstring;
356 Side : in Trim_End)
357 return EAstring;
358 -- Returns the EAstring obtained by removing from Source all leading (if
359 -- Side = Left), trailing (if Side = Right), or leading and trailing (if
360 -- Side = Both) space characters (U+0020).
361
362 procedure Trim (Source : in out EAstring; Side : in Trim_End);
363 -- Like above, but changes Source instead of returning a new string.
364
365 -- function Trim
366 -- (Source : in EAstring;
367 -- Left : in Maps.Character_Set;
368 -- Right : in Maps.Character_Set)
369 -- return EAstring;
370 --
371 -- procedure Trim
372 -- (Source : in out EAstring;
373 -- Left : in Maps.Character_Set;
374 -- Right : in Maps.Character_Set);
375
376 function Head
377 (Source : in EAstring;
378 Count : in Natural)
379 -- : in Unicode_Character := Space" to be added later.
380 return EAstring;
381 -- Returns an EAstring of length Count. If Count <= Length(Source), the
382 -- string comprises the first Count characters of Source. Otherwise its
383 -- contents are Source concatenated with Count - Length(Source) spaces.
384
385 procedure Head (Source : in out EAstring; Count : in Natural);
386 -- "Pad : in Unicode_Character := Space" to be added later.
387 -- Like above, but changes Source instead of returning a new string.
388
389 function Tail
390 (Source : in EAstring;
391 Count : in Natural)
392 -- : in Unicode_Character := Space" to be added later.
393 return EAstring;
394 -- Returns an EAstring of length Count. If Count <= Length(Source), the
395 -- string comprises the last Count characters of Source. Otherwise its
396 -- contents are Count - Length(Source) spaces concatenated with Source.
397
398 procedure Tail (Source : in out EAstring; Count : in Natural);
399 -- "Pad : in Unicode_Character := Space" to be added later.
400 -- Like above, but changes Source instead of returning a new string.
401
402 function "*" (Left : in Natural; Right : in EAstring) return EAstring;
403 -- Returns an EAstring that comprises Left concatenated copies of Right.
404
405 ---------------------------------------------------------------------------
406 -- package Character_Encodings
407 ---------------------------------------------------------------------------
408
409 package Character_Encodings is
410
411 use Ada.Strings.Unbounded;
412
413 type Character_Encoding (Known : Boolean := False) is private;
414
415 subtype Known_Character_Encoding is Character_Encoding (True);
416
417 -- Note: If Gnat crashes on a declaration of a Character_Encoding
418 -- variable, try initializing the variable at the declaration. (This
419 -- behaviour was observed with GCC-Gnat 3.4.0.)
420
421 subtype Character_Width is Positive range 1 .. 4;
422
423 UCS_4_BE, UCS_4_LE, UCS_4, UCS_2_BE, UCS_2_LE, UCS_2, UTF_16_BE,
424UTF_16_LE, UTF_16, UTF_8, US_ASCII, Latin_1 : constant
425 Known_Character_Encoding;
426 -- These are the encodings that this package knows some characteristics
427 -- of. They are therefore handled more efficiently than other encodings
428 -- in some situations. "BE" and "LE" means big-endian and
429 -- little-endian. UCS_4, UCS_2 and UTF_16 will have the machine's
430 -- native byte order.
431
432 overriding function "=" (Left, Right : Character_Encoding) return Boolean;
433 -- Redefining "=" shouldn't be necessary, but it works around a bug in
434 -- GCC-Gnat 3.4.0.
435
436 function Name
437 (Encoding : Character_Encoding)
438 return Unbounded_String;
439
440 function Encoding_By_Name (Name : String) return Character_Encoding;
441
442 function Fixed_Width (Encoding : Character_Encoding) return Boolean;
443 -- Returns true if Encoding is a Known_Character_Encoding and uses the
444 -- same number of bytes for all characters, false otherwise.
445
446 function Width
447 (Encoding : Known_Character_Encoding)
448 return Character_Width;
449 -- Returns the number of bytes used for one character in this encoding.
450 -- Propagates Constraint_Error if the character width is variable.
451
452 function ASCII_Compatible
453 (Encoding : Character_Encoding)
454 return Boolean;
455 -- Returns true if and only if Encoding is a Known_Character_Encoding
456 -- and is compatible with ASCII so that all the characters in ASCII are
457 -- encoded with the same numbers in this encoding as in ASCII, and any
458 -- valid ASCII text is also a valid text in this encoding.
459
460 function Subset_Of_UCS_2
461 (Encoding : Character_Encoding)
462 return Boolean;
463 -- Returns true if and only if Encoding is a Known_Character_Encoding
464 -- and all the characters that can be represented in this encoding can
465 -- also be represented in UCS-2 (but mostly with different numbers).
466 -- (Note that UCS-2 itself fits the criteria.)
467
468 ------------------------------------------------------------------------
469 -----
470 private
471 ------------------------------------------------------------------------
472 -----
473
474 use AdaCL.OS.Low_Level;
475 use Interfaces;
476 use Ada.Strings.Maps;
477
478 type Encoding_Data is record
479 Name : Unbounded_String;
480 Windows_Number : Unsigned_16;
481 Width : Natural range 0 .. 4; -- 0 for variable
482 ASCII_Compatible : Boolean;
483 Subset_Of_UCS_2 : Boolean;
484 end record;
485
486 type Encoding_ID is (
487 UCS_4_BE_ID,
488 UCS_4_LE_ID,
489 UCS_2_BE_ID,
490 UCS_2_LE_ID,
491 UTF_16_BE_ID,
492 UTF_16_LE_ID,
493 UTF_8_ID,
494 ASCII_ID,
495 Latin_1_ID,
496 CP_850_ID,
497 Windows_1252_ID);
498
499 Encoding_Info : constant array (Encoding_ID) of Encoding_Data :=
500 [(To_Unbounded_String ("UCS-4BE"), 12001, 4, False, False),
501 (To_Unbounded_String ("UCS-4LE"), 12000, 4, False, False),
502 (To_Unbounded_String ("UCS-2BE"), 1201, 2, False, True),
503 (To_Unbounded_String ("UCS-2LE"), 1200, 2, False, True),
504 (To_Unbounded_String ("UTF-16BE"), 0, 0, False, False),
505 (To_Unbounded_String ("UTF-16LE"), 0, 0, False, False),
506 (To_Unbounded_String ("UTF-8"), 65001, 0, True, False),
507 (To_Unbounded_String ("US-ASCII"), 20127, 1, True, True),
508 (To_Unbounded_String ("ISO-8859-1"), 28591, 1, True, True),
509 (To_Unbounded_String ("IBM850"), 850, 1, True, True),
510 (To_Unbounded_String ("WINDOWS-1252"), 1252, 1, True, True)];
511
512 type Encoding_Alias is record
513 Name : Unbounded_String;
514 ID : Encoding_ID;
515 end record;
516
517 Encoding_Aliases : constant
518 array (Positive range <>) of Encoding_Alias :=
519 [(To_Unbounded_String ("ANSI-X3.4-1968"), ASCII_ID),
520 (To_Unbounded_String ("ASCII"), ASCII_ID),
521 (To_Unbounded_String ("CP-850"), CP_850_ID),
522 (To_Unbounded_String ("CP850"), CP_850_ID),
523 (To_Unbounded_String ("IBM850"), CP_850_ID),
524 (To_Unbounded_String ("ISO-8859-1"), Latin_1_ID),
525 (To_Unbounded_String ("LATIN-1"), Latin_1_ID),
526 (To_Unbounded_String ("LATIN1"), Latin_1_ID),
527 (To_Unbounded_String ("UCS-2BE"), UCS_2_BE_ID),
528 (To_Unbounded_String ("UCS-2LE"), UCS_2_LE_ID),
529 (To_Unbounded_String ("UCS-4BE"), UCS_4_BE_ID),
530 (To_Unbounded_String ("UCS-4LE"), UCS_4_LE_ID),
531 (To_Unbounded_String ("US-ASCII"), ASCII_ID),
532 (To_Unbounded_String ("UTF-16BE"), UTF_16_BE_ID),
533 (To_Unbounded_String ("UTF-16LE"), UTF_16_LE_ID),
534 (To_Unbounded_String ("UTF-32BE"), UCS_4_BE_ID),
535 (To_Unbounded_String ("UTF-32LE"), UCS_4_LE_ID),
536 (To_Unbounded_String ("UTF-8"), UTF_8_ID),
537 (To_Unbounded_String ("WINDOWS-1252"), Windows_1252_ID)];
538
539 Normalization_Map : constant Character_Mapping :=
540 To_Mapping
541 (From => "abcdefghijklmnopqrstuvwxyz_",
542 To => "ABCDEFGHIJKLMNOPQRSTUVWXYZ-");
543
544 type Unified_Encoding_Record (Known : Boolean; OS : Known_OS) is record
545 case Known is
546 when True =>
547 Which : Encoding_ID;
548 when False =>
549 case OS is
550 when Linux | MacOS =>
551 Name : Unbounded_String;
552 -- Names of encodings are confined to ASCII.
553 when Windows =>
554 Number : Unsigned_16;
555 end case;
556 end case;
557 end record;
558
559 type Character_Encoding (Known : Boolean := False) is new
560 Unified_Encoding_Record (Known => Known, OS => This_OS);
561
562 UCS_4_BE : constant Known_Character_Encoding := (True, UCS_4_BE_ID);
563 UCS_4_LE : constant Known_Character_Encoding := (True, UCS_4_LE_ID);
564 UCS_2_BE : constant Known_Character_Encoding := (True, UCS_2_BE_ID);
565 UCS_2_LE : constant Known_Character_Encoding := (True, UCS_2_LE_ID);
566 UTF_16_BE : constant Known_Character_Encoding := (True, UTF_16_BE_ID);
567 UTF_16_LE : constant Known_Character_Encoding := (True, UTF_16_LE_ID);
568 UTF_8 : constant Known_Character_Encoding := (True, UTF_8_ID);
569 US_ASCII : constant Known_Character_Encoding := (True, ASCII_ID);
570 Latin_1 : constant Known_Character_Encoding := (True, Latin_1_ID);
571
572 -- Define UCS_4, UCS_2 and UTF_16 with the macine's native byte order.
573
574 Endianness_Shift : constant array (System.Bit_Order) of Natural :=
575 [System.High_Order_First => 0,
576 System.Low_Order_First => 1];
577
578 UCS_4 : constant Known_Character_Encoding :=
579 (True,
580 Encoding_ID'Val
581 (Encoding_ID'Pos (UCS_4_BE_ID) +
582 Endianness_Shift (System.Default_Bit_Order)));
583 UCS_2 : constant Known_Character_Encoding :=
584 (True,
585 Encoding_ID'Val
586 (Encoding_ID'Pos (UCS_2_BE_ID) +
587 Endianness_Shift (System.Default_Bit_Order)));
588 UTF_16 : constant Known_Character_Encoding :=
589 (True,
590 Encoding_ID'Val
591 (Encoding_ID'Pos (UTF_16_BE_ID) +
592 Endianness_Shift (System.Default_Bit_Order)));
593
594 end Character_Encodings;
595
596 ---------------------------------------------------------------------------
597
598 -------------------------
599 -- Encoding management --
600 -------------------------
601
602 use Character_Encodings;
603
604 function Transcode
605 (Source : in EAstring;
606 New_Encoding : in Character_Encoding)
607 return EAstring;
608 -- Returns a copy of Source encoded in New_Encoding.
609
610 procedure Transcode
611 (Source : in out EAstring;
612 New_Encoding : in Character_Encoding);
613 -- Makes sure that Source is encoded in New_Encoding.
614
615 function Transcode
616 (Source : in EAstring_Array;
617 New_Encoding : in Character_Encoding)
618 return EAstring_Array;
619 -- Returns a copy of Source where each string is encoded in New_Encoding.
620
621 procedure Transcode
622 (Source : in out EAstring_Array;
623 New_Encoding : in Character_Encoding);
624 -- Makes sure that each string in Source is encoded in New_Encoding.
625
626 function Open_Converter
627 (From, To : Character_Encoding)
628 return Converter;
629 -- Returns a freshly allocated converter object that converts text from
630 -- the encoding given in From to the one in To. The converter must be
631 -- freed with Close_Converter. Unsupported_Conversion is raised if the
632 -- operating system doesn't support this conversion.
633
634 procedure Convert
635 (State : in Converter;
636 Source : in Byte_Sequence;
637 Target : out Byte_Sequence;
638 Source_Last : out Natural;
639 Target_Last : out Natural;
640 Cause : out Conversion_Stop_Cause);
641 -- State must be a converter object allocated by Open_Converter. Source
642 -- must be a string encoded in the encoding that this converter converts
643 -- from. The string is converted and stored in Target. The indices in
644 -- Source and in Target of the last byte of the last character that was
645 -- converted are stored in Source_Last and Target_Last, respectively. The
646 -- reason why Convert stopped is reported in Cause.
647
648 procedure Reset_Converter (Item : in Converter);
649 -- Resets a converter object to its initial state.
650
651 procedure Close_Converter (Item : in out Converter);
652 -- Frees a converter object allocated by Open_Converter.
653
654 function To_EAstring
655 (Source : in Byte_Sequence;
656 Encoding : in Character_Encoding)
657 return EAstring;
658 -- Constructs an EAstring from a byte sequence and a character encoding.
659 -- Source had better be a valid encoded text in Encoding.
660
661 function Encoding (Source : in EAstring) return Character_Encoding;
662 -- Returns the encoding that an EAstring is currently encoded in.
663
664 function Bytes (Source : in EAstring) return Byte_Sequence;
665 -- Returns the byte sequence that an EAstring is encoded as in its current
666 -- encoding.
667
668 function Size (Source : EAstring) return Natural;
669 -- Returns the number of bytes in the byte sequence that the string is
670 -- currently encoded as.
671
672 function Byte_Sequence_To_Fake_String
673 (Source : in Byte_Sequence)
674 return String;
675 -- Unchecked conversion of Byte_Sequence to String. The result will not be
676 -- a real String (unless the byte sequence happens to be in Latin 1). For
677 -- passing strings to functions that take String parameters but interpret
678 -- them as some other encoding.
679
680 function Fake_String_To_Byte_Sequence
681 (Source : in String)
682 return Byte_Sequence;
683 -- Unchecked conversion of String to Byte_Sequence. For making EAstrings
684 -- from objects that are declared as String but contain some other
685 -- encoding than Latin 1.
686
687 ----------------
688 -- Exceptions --
689 ----------------
690
691 Unsupported_Conversion : exception;
692 -- An attempt was made to do an encoding conversion that the operating
693 -- system doesn't support. Probably the target encoding name wasn't
694 -- recognized.
695
696 Incomplete_Byte_Sequence : exception;
697 -- The string ends in an incomplete code sequence. Maybe it has been cut
698 -- off in the wrong place.
699
700 Conversion_Impossible : exception;
701 -- One of these errors happened during a conversion: 1: The byte sequence
702 -- isn't a valid encoded string for its associated
703 -- encoding.
704 -- 2: The string contains a character that can't be represented in the
705 -- target encoding.
706
707 Other_Error : exception;
708 -- An undocumented error happened in a C function.
709
710 ---------------------------------------------------------------------------
711 -----
712private
713 ---------------------------------------------------------------------------
714 -----
715
716 package AF renames Ada.Finalization;
717
718 type Converter is new System.Address;
719
720 Null_Sequence : aliased Byte_Sequence := [1 .. 0 => 0];
721
722 Null_String_Encoding : constant Character_Encoding := US_ASCII;
723 -- The null string is set to the least demanding encoding to avoid
724 -- unnecessarily transcoding other strings.
725
726 type Byte_Sequence_Access is access all Byte_Sequence;
727
728 type EAstring is new AF.Controlled with record
729 Encoding : Character_Encoding := Null_String_Encoding;
730 Reference : Byte_Sequence_Access := Null_Sequence'Access;
731 Last : Natural := 0;
732 end record;
733
734 -- EAstrings uses a buffered implementation to increase the speed of the
735 -- string manipulation procedures. The Byte_Sequence that Reference points
736 -- to contains the current encoded string value and extra room at the end
737 -- to be used by the next Append routine. Last is the index of the last
738 -- byte of the string. So the current byte sequence is really Reference(1
739 -- .. Last).
740
741 -- pragma Stream_Convert(EAstring, To_EA, To_String);
742
743 pragma Finalize_Storage_Only (EAstring);
744
745 overriding procedure Initialize (Object : in out EAstring);
746 overriding procedure Adjust (Object : in out EAstring);
747 overriding procedure Finalize (Object : in out EAstring);
748
749 procedure Realloc_For_Chunk
750 (Source : in out EAstring;
751 Chunk_Size : Natural);
752 pragma Inline (Realloc_For_Chunk);
753 -- Adjust the size allocated for the string. Add at least Chunk_Size so it
754 -- is safe to add a byte sequence of this size at the end of the current
755 -- content. The real size allocated for the byte sequence is Chunk_Size +
756 -- x % of the current byte sequence size. This buffered handling makes the
757 -- Append routines very fast.
758
759 Null_EAstring : constant EAstring :=
760 (AF.Controlled with
761 Encoding => Null_String_Encoding,
762 Reference => Null_Sequence'Access,
763 Last => 0);
764
765 Null_Converter : constant Converter := Converter (System.Null_Address);
766
767end AdaCL.EAstrings;