summaryrefslogtreecommitdiffstats
path: root/Objects/stringlib/codecs.h
blob: f353367013a61b4c776676c81bc1b2999b50f654 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
/* stringlib: codec implementations */

#if STRINGLIB_IS_UNICODE

/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif

/* 10xxxxxx */
#define IS_CONTINUATION_BYTE(ch) ((ch) >= 0x80 && (ch) < 0xC0)

Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf8_decode)(const char **inptr, const char *end,
                       STRINGLIB_CHAR *dest,
                       Py_ssize_t *outpos)
{
    Py_UCS4 ch;
    const char *s = *inptr;
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
    STRINGLIB_CHAR *p = dest + *outpos;

    while (s < end) {
        ch = (unsigned char)*s;

        if (ch < 0x80) {
            /* Fast path for runs of ASCII characters. Given that common UTF-8
               input will consist of an overwhelming majority of ASCII
               characters, we try to optimize for this case by checking
               as many characters as a C 'long' can contain.
               First, check if we can do an aligned read, as most CPUs have
               a penalty for unaligned reads.
            */
            if (_Py_IS_ALIGNED(s, SIZEOF_LONG)) {
                /* Help register allocation */
                register const char *_s = s;
                register STRINGLIB_CHAR *_p = p;
                while (_s < aligned_end) {
                    /* Read a whole long at a time (either 4 or 8 bytes),
                       and do a fast unrolled copy if it only contains ASCII
                       characters. */
                    unsigned long value = *(unsigned long *) _s;
                    if (value & ASCII_CHAR_MASK)
                        break;
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
                    _p[0] = (STRINGLIB_CHAR)(value & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
# if SIZEOF_LONG == 8
                    _p[4] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
# endif
#else
# if SIZEOF_LONG == 8
                    _p[0] = (STRINGLIB_CHAR)((value >> 56) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 48) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 40) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)((value >> 32) & 0xFFu);
                    _p[4] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[5] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[6] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[7] = (STRINGLIB_CHAR)(value & 0xFFu);
# else
                    _p[0] = (STRINGLIB_CHAR)((value >> 24) & 0xFFu);
                    _p[1] = (STRINGLIB_CHAR)((value >> 16) & 0xFFu);
                    _p[2] = (STRINGLIB_CHAR)((value >> 8) & 0xFFu);
                    _p[3] = (STRINGLIB_CHAR)(value & 0xFFu);
# endif
#endif
                    _s += SIZEOF_LONG;
                    _p += SIZEOF_LONG;
                }
                s = _s;
                p = _p;
                if (s == end)
                    break;
                ch = (unsigned char)*s;
            }
            if (ch < 0x80) {
                s++;
                *p++ = ch;
                continue;
            }
        }

        if (ch < 0xE0) {
            /* \xC2\x80-\xDF\xBF -- 0080-07FF */
            Py_UCS4 ch2;
            if (ch < 0xC2) {
                /* invalid sequence
                \x80-\xBF -- continuation byte
                \xC0-\xC1 -- fake 0000-007F */
                goto InvalidStart;
            }
            if (end - s < 2) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                break;
            }
            ch2 = (unsigned char)s[1];
            if (!IS_CONTINUATION_BYTE(ch2))
                /* invalid continuation byte */
                goto InvalidContinuation1;
            ch = (ch << 6) + ch2 -
                 ((0xC0 << 6) + 0x80);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            s += 2;
            if (STRINGLIB_MAX_CHAR <= 0x007F ||
                (STRINGLIB_MAX_CHAR < 0x07FF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF0) {
            /* \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF */
            Py_UCS4 ch2, ch3;
            if (end - s < 3) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0xA0 ? ch == 0xE0 : ch == 0xED))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xE0) {
                if (ch2 < 0xA0)
                    /* invalid sequence
                       \xE0\x80\x80-\xE0\x9F\xBF -- fake 0000-0800 */
                    goto InvalidContinuation1;
            } else if (ch == 0xED && ch2 >= 0xA0) {
                /* Decoding UTF-8 sequences in range \xED\xA0\x80-\xED\xBF\xBF
                   will result in surrogates in range D800-DFFF. Surrogates are
                   not valid UTF-8 so they are rejected.
                   See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
                   (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            ch = (ch << 12) + (ch2 << 6) + ch3 -
                 ((0xE0 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            s += 3;
            if (STRINGLIB_MAX_CHAR <= 0x07FF ||
                (STRINGLIB_MAX_CHAR < 0xFFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }

        if (ch < 0xF5) {
            /* \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF */
            Py_UCS4 ch2, ch3, ch4;
            if (end - s < 4) {
                /* unexpected end of data: the caller will decide whether
                   it's an error or not */
                if (end - s < 2)
                    break;
                ch2 = (unsigned char)s[1];
                if (!IS_CONTINUATION_BYTE(ch2) ||
                    (ch2 < 0x90 ? ch == 0xF0 : ch == 0xF4))
                    /* for clarification see comments below */
                    goto InvalidContinuation1;
                if (end - s < 3)
                    break;
                ch3 = (unsigned char)s[2];
                if (!IS_CONTINUATION_BYTE(ch3))
                    goto InvalidContinuation2;
                break;
            }
            ch2 = (unsigned char)s[1];
            ch3 = (unsigned char)s[2];
            ch4 = (unsigned char)s[3];
            if (!IS_CONTINUATION_BYTE(ch2)) {
                /* invalid continuation byte */
                goto InvalidContinuation1;
            }
            if (ch == 0xF0) {
                if (ch2 < 0x90)
                    /* invalid sequence
                       \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF -- fake 0000-FFFF */
                    goto InvalidContinuation1;
            } else if (ch == 0xF4 && ch2 >= 0x90) {
                /* invalid sequence
                   \xF4\x90\x80\80- -- 110000- overflow */
                goto InvalidContinuation1;
            }
            if (!IS_CONTINUATION_BYTE(ch3)) {
                /* invalid continuation byte */
                goto InvalidContinuation2;
            }
            if (!IS_CONTINUATION_BYTE(ch4)) {
                /* invalid continuation byte */
                goto InvalidContinuation3;
            }
            ch = (ch << 18) + (ch2 << 12) + (ch3 << 6) + ch4 -
                 ((0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80);
            assert ((ch > 0xFFFF) && (ch <= 0x10FFFF));
            s += 4;
            if (STRINGLIB_MAX_CHAR <= 0xFFFF ||
                (STRINGLIB_MAX_CHAR < 0x10FFFF && ch > STRINGLIB_MAX_CHAR))
                /* Out-of-range */
                goto Return;
            *p++ = ch;
            continue;
        }
        goto InvalidStart;
    }
    ch = 0;
Return:
    *inptr = s;
    *outpos = p - dest;
    return ch;
InvalidStart:
    ch = 1;
    goto Return;
InvalidContinuation1:
    ch = 2;
    goto Return;
InvalidContinuation2:
    ch = 3;
    goto Return;
InvalidContinuation3:
    ch = 4;
    goto Return;
}

#undef ASCII_CHAR_MASK


/* UTF-8 encoder specialized for a Unicode kind to avoid the slow
   PyUnicode_READ() macro. Delete some parts of the code depending on the kind:
   UCS-1 strings don't need to handle surrogates for example. */
Py_LOCAL_INLINE(PyObject *)
STRINGLIB(utf8_encoder)(PyObject *unicode,
                        STRINGLIB_CHAR *data,
                        Py_ssize_t size,
                        const char *errors)
{
#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */

    Py_ssize_t i;                /* index into s of next input byte */
    PyObject *result;            /* result string object */
    char *p;                     /* next free byte in output buffer */
    Py_ssize_t nallocated;      /* number of result bytes allocated */
    Py_ssize_t nneeded;            /* number of result bytes needed */
#if STRINGLIB_SIZEOF_CHAR > 1
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *rep = NULL;
#endif
#if STRINGLIB_SIZEOF_CHAR == 1
    const Py_ssize_t max_char_size = 2;
    char stackbuf[MAX_SHORT_UNICHARS * 2];
#elif STRINGLIB_SIZEOF_CHAR == 2
    const Py_ssize_t max_char_size = 3;
    char stackbuf[MAX_SHORT_UNICHARS * 3];
#else /*  STRINGLIB_SIZEOF_CHAR == 4 */
    const Py_ssize_t max_char_size = 4;
    char stackbuf[MAX_SHORT_UNICHARS * 4];
#endif

    assert(size >= 0);

    if (size <= MAX_SHORT_UNICHARS) {
        /* Write into the stack buffer; nallocated can't overflow.
         * At the end, we'll allocate exactly as much heap space as it
         * turns out we need.
         */
        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
        result = NULL;   /* will allocate after we're done */
        p = stackbuf;
    }
    else {
        if (size > PY_SSIZE_T_MAX / max_char_size) {
            /* integer overflow */
            return PyErr_NoMemory();
        }
        /* Overallocate on the heap, and give the excess back at the end. */
        nallocated = size * max_char_size;
        result = PyBytes_FromStringAndSize(NULL, nallocated);
        if (result == NULL)
            return NULL;
        p = PyBytes_AS_STRING(result);
    }

    for (i = 0; i < size;) {
        Py_UCS4 ch = data[i++];

        if (ch < 0x80) {
            /* Encode ASCII */
            *p++ = (char) ch;

        }
        else
#if STRINGLIB_SIZEOF_CHAR > 1
        if (ch < 0x0800)
#endif
        {
            /* Encode Latin-1 */
            *p++ = (char)(0xc0 | (ch >> 6));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 1
        else if (Py_UNICODE_IS_SURROGATE(ch)) {
            Py_ssize_t newpos;
            Py_ssize_t repsize, k, startpos;
            startpos = i-1;
            rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, "utf-8", "surrogates not allowed",
                  unicode, &exc, startpos, startpos+1, &newpos);
            if (!rep)
                goto error;

            if (PyBytes_Check(rep))
                repsize = PyBytes_GET_SIZE(rep);
            else
                repsize = PyUnicode_GET_LENGTH(rep);

            if (repsize > max_char_size) {
                Py_ssize_t offset;

                if (result == NULL)
                    offset = p - stackbuf;
                else
                    offset = p - PyBytes_AS_STRING(result);

                if (nallocated > PY_SSIZE_T_MAX - repsize + max_char_size) {
                    /* integer overflow */
                    PyErr_NoMemory();
                    goto error;
                }
                nallocated += repsize - max_char_size;
                if (result != NULL) {
                    if (_PyBytes_Resize(&result, nallocated) < 0)
                        goto error;
                } else {
                    result = PyBytes_FromStringAndSize(NULL, nallocated);
                    if (result == NULL)
                        goto error;
                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
                }
                p = PyBytes_AS_STRING(result) + offset;
            }

            if (PyBytes_Check(rep)) {
                char *prep = PyBytes_AS_STRING(rep);
                for(k = repsize; k > 0; k--)
                    *p++ = *prep++;
            } else /* rep is unicode */ {
                enum PyUnicode_Kind repkind;
                void *repdata;

                if (PyUnicode_READY(rep) < 0)
                    goto error;
                repkind = PyUnicode_KIND(rep);
                repdata = PyUnicode_DATA(rep);

                for(k=0; k<repsize; k++) {
                    Py_UCS4 c = PyUnicode_READ(repkind, repdata, k);
                    if (0x80 <= c) {
                        raise_encode_exception(&exc, "utf-8",
                                               unicode,
                                               i-1, i,
                                               "surrogates not allowed");
                        goto error;
                    }
                    *p++ = (char)c;
                }
            }
            Py_CLEAR(rep);
        }
        else
#if STRINGLIB_SIZEOF_CHAR > 2
        if (ch < 0x10000)
#endif
        {
            *p++ = (char)(0xe0 | (ch >> 12));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#if STRINGLIB_SIZEOF_CHAR > 2
        else /* ch >= 0x10000 */
        {
            assert(ch <= MAX_UNICODE);
            /* Encode UCS4 Unicode ordinals */
            *p++ = (char)(0xf0 | (ch >> 18));
            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
            *p++ = (char)(0x80 | (ch & 0x3f));
        }
#endif /* STRINGLIB_SIZEOF_CHAR > 2 */
#endif /* STRINGLIB_SIZEOF_CHAR > 1 */
    }

    if (result == NULL) {
        /* This was stack allocated. */
        nneeded = p - stackbuf;
        assert(nneeded <= nallocated);
        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
    }
    else {
        /* Cut back to size actually needed. */
        nneeded = p - PyBytes_AS_STRING(result);
        assert(nneeded <= nallocated);
        _PyBytes_Resize(&result, nneeded);
    }

#if STRINGLIB_SIZEOF_CHAR > 1
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
#endif
    return result;

#if STRINGLIB_SIZEOF_CHAR > 1
 error:
    Py_XDECREF(rep);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(result);
    return NULL;
#endif

#undef MAX_SHORT_UNICHARS
}

/* The pattern for constructing UCS2-repeated masks. */
#if SIZEOF_LONG == 8
# define UCS2_REPEAT_MASK 0x0001000100010001ul
#elif SIZEOF_LONG == 4
# define UCS2_REPEAT_MASK 0x00010001ul
#else
# error C 'long' size should be either 4 or 8!
#endif

/* The mask for fast checking. */
#if STRINGLIB_SIZEOF_CHAR == 1
/* The mask for fast checking of whether a C 'long' contains a
   non-ASCII or non-Latin1 UTF16-encoded characters. */
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * (0xFFFFu & ~STRINGLIB_MAX_CHAR))
#else
/* The mask for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
*/
# define FAST_CHAR_MASK         (UCS2_REPEAT_MASK * 0x8000u)
#endif
/* The mask for fast byte-swapping. */
#define STRIPPED_MASK           (UCS2_REPEAT_MASK * 0x00FFu)
/* Swap bytes. */
#define SWAB(value)             ((((value) >> 8) & STRIPPED_MASK) | \
                                 (((value) & STRIPPED_MASK) << 8))

Py_LOCAL_INLINE(Py_UCS4)
STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
                        STRINGLIB_CHAR *dest, Py_ssize_t *outpos,
                        int native_ordering)
{
    Py_UCS4 ch;
    const unsigned char *aligned_end =
            (const unsigned char *) _Py_ALIGN_DOWN(e, SIZEOF_LONG);
    const unsigned char *q = *inptr;
    STRINGLIB_CHAR *p = dest + *outpos;
    /* Offsets from q for retrieving byte pairs in the right order. */
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
    int ihi = !!native_ordering, ilo = !native_ordering;
#else
    int ihi = !native_ordering, ilo = !!native_ordering;
#endif
    --e;

    while (q < e) {
        Py_UCS4 ch2;
        /* First check for possible aligned read of a C 'long'. Unaligned
           reads are more expensive, better to defer to another iteration. */
        if (_Py_IS_ALIGNED(q, SIZEOF_LONG)) {
            /* Fast path for runs of in-range non-surrogate chars. */
            register const unsigned char *_q = q;
            while (_q < aligned_end) {
                unsigned long block = * (unsigned long *) _q;
                if (native_ordering) {
                    /* Can use buffer directly */
                    if (block & FAST_CHAR_MASK)
                        break;
                }
                else {
                    /* Need to byte-swap */
                    if (block & SWAB(FAST_CHAR_MASK))
                        break;
#if STRINGLIB_SIZEOF_CHAR == 1
                    block >>= 8;
#else
                    block = SWAB(block);
#endif
                }
#ifdef BYTEORDER_IS_LITTLE_ENDIAN
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)(block >> 16);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block & 0xFFFFu);
                p[1] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block >> 48);
# endif
#else
# if SIZEOF_LONG == 4
                p[0] = (STRINGLIB_CHAR)(block >> 16);
                p[1] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# elif SIZEOF_LONG == 8
                p[0] = (STRINGLIB_CHAR)(block >> 48);
                p[1] = (STRINGLIB_CHAR)((block >> 32) & 0xFFFFu);
                p[2] = (STRINGLIB_CHAR)((block >> 16) & 0xFFFFu);
                p[3] = (STRINGLIB_CHAR)(block & 0xFFFFu);
# endif
#endif
                _q += SIZEOF_LONG;
                p += SIZEOF_LONG / 2;
            }
            q = _q;
            if (q >= e)
                break;
        }

        ch = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_SURROGATE(ch)) {
#if STRINGLIB_SIZEOF_CHAR < 2
            if (ch > STRINGLIB_MAX_CHAR)
                /* Out-of-range */
                goto Return;
#endif
            *p++ = (STRINGLIB_CHAR)ch;
            continue;
        }

        /* UTF-16 code pair: */
        if (q >= e)
            goto UnexpectedEnd;
        if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
            goto IllegalEncoding;
        ch2 = (q[ihi] << 8) | q[ilo];
        q += 2;
        if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))
            goto IllegalSurrogate;
        ch = Py_UNICODE_JOIN_SURROGATES(ch, ch2);
#if STRINGLIB_SIZEOF_CHAR < 4
        /* Out-of-range */
        goto Return;
#else
        *p++ = (STRINGLIB_CHAR)ch;
#endif
    }
    ch = 0;
Return:
    *inptr = q;
    *outpos = p - dest;
    return ch;
UnexpectedEnd:
    ch = 1;
    goto Return;
IllegalEncoding:
    ch = 2;
    goto Return;
IllegalSurrogate:
    ch = 3;
    goto Return;
}
#undef UCS2_REPEAT_MASK
#undef FAST_CHAR_MASK
#undef STRIPPED_MASK
#undef SWAB


Py_LOCAL_INLINE(void)
STRINGLIB(utf16_encode)(unsigned short *out,
                        const STRINGLIB_CHAR *in,
                        Py_ssize_t len,
                        int native_ordering)
{
    const STRINGLIB_CHAR *end = in + len;
#if STRINGLIB_SIZEOF_CHAR == 1
# define SWAB2(CH)  ((CH) << 8)
#else
# define SWAB2(CH)  (((CH) << 8) | ((CH) >> 8))
#endif
#if STRINGLIB_MAX_CHAR < 0x10000
    if (native_ordering) {
# if STRINGLIB_SIZEOF_CHAR == 2
        Py_MEMCPY(out, in, 2 * len);
# else
        _PyUnicode_CONVERT_BYTES(STRINGLIB_CHAR, unsigned short, in, end, out);
# endif
    } else {
        const STRINGLIB_CHAR *unrolled_end = in + _Py_SIZE_ROUND_DOWN(len, 4);
        while (in < unrolled_end) {
            out[0] = SWAB2(in[0]);
            out[1] = SWAB2(in[1]);
            out[2] = SWAB2(in[2]);
            out[3] = SWAB2(in[3]);
            in += 4; out += 4;
        }
        while (in < end) {
            *out++ = SWAB2(*in);
            ++in;
        }
    }
#else
    if (native_ordering) {
        while (in < end) {
            Py_UCS4 ch = *in++;
            if (ch < 0x10000)
                *out++ = ch;
            else {
                out[0] = Py_UNICODE_HIGH_SURROGATE(ch);
                out[1] = Py_UNICODE_LOW_SURROGATE(ch);
                out += 2;
            }
        }
    } else {
        while (in < end) {
            Py_UCS4 ch = *in++;
            if (ch < 0x10000)
                *out++ = SWAB2((Py_UCS2)ch);
            else {
                Py_UCS2 ch1 = Py_UNICODE_HIGH_SURROGATE(ch);
                Py_UCS2 ch2 = Py_UNICODE_LOW_SURROGATE(ch);
                out[0] = SWAB2(ch1);
                out[1] = SWAB2(ch2);
                out += 2;
            }
        }
    }
#endif
#undef SWAB2
}
#endif /* STRINGLIB_IS_UNICODE */
buf[-1] in B_CRLF: buf = buf[:-1] buf = buf + B_CRLF conn.sendall(buf) if callback: callback(buf) # shutdown ssl layer if _SSLSocket is not None and isinstance(conn, _SSLSocket): conn.unwrap() return self.voidresp() def acct(self, password): '''Send new account name.''' cmd = 'ACCT ' + password return self.voidcmd(cmd) def nlst(self, *args): '''Return a list of files in a given directory (default the current).''' cmd = 'NLST' for arg in args: cmd = cmd + (' ' + arg) files = [] self.retrlines(cmd, files.append) return files def dir(self, *args): '''List a directory in long form. By default list current directory to stdout. Optional last argument is callback function; all non-empty arguments before it are concatenated to the LIST command. (This *should* only be used for a pathname.)''' cmd = 'LIST' func = None if args[-1:] and type(args[-1]) != type(''): args, func = args[:-1], args[-1] for arg in args: if arg: cmd = cmd + (' ' + arg) self.retrlines(cmd, func) def mlsd(self, path="", facts=[]): '''List a directory in a standardized format by using MLSD command (RFC-3659). If path is omitted the current directory is assumed. "facts" is a list of strings representing the type of information desired (e.g. ["type", "size", "perm"]). Return a generator object yielding a tuple of two elements for every file found in path. First element is the file name, the second one is a dictionary including a variable number of "facts" depending on the server and whether "facts" argument has been provided. ''' if facts: self.sendcmd("OPTS MLST " + ";".join(facts) + ";") if path: cmd = "MLSD %s" % path else: cmd = "MLSD" lines = [] self.retrlines(cmd, lines.append) for line in lines: facts_found, _, name = line.rstrip(CRLF).partition(' ') entry = {} for fact in facts_found[:-1].split(";"): key, _, value = fact.partition("=") entry[key.lower()] = value yield (name, entry) def rename(self, fromname, toname): '''Rename a file.''' resp = self.sendcmd('RNFR ' + fromname) if resp[0] != '3': raise error_reply(resp) return self.voidcmd('RNTO ' + toname) def delete(self, filename): '''Delete a file.''' resp = self.sendcmd('DELE ' + filename) if resp[:3] in {'250', '200'}: return resp else: raise error_reply(resp) def cwd(self, dirname): '''Change to a directory.''' if dirname == '..': try: return self.voidcmd('CDUP') except error_perm as msg: if msg.args[0][:3] != '500': raise elif dirname == '': dirname = '.' # does nothing, but could return error cmd = 'CWD ' + dirname return self.voidcmd(cmd) def size(self, filename): '''Retrieve the size of a file.''' # The SIZE command is defined in RFC-3659 resp = self.sendcmd('SIZE ' + filename) if resp[:3] == '213': s = resp[3:].strip() return int(s) def mkd(self, dirname): '''Make a directory, return its full pathname.''' resp = self.voidcmd('MKD ' + dirname) # fix around non-compliant implementations such as IIS shipped # with Windows server 2003 if not resp.startswith('257'): return '' return parse257(resp) def rmd(self, dirname): '''Remove a directory.''' return self.voidcmd('RMD ' + dirname) def pwd(self): '''Return current working directory.''' resp = self.voidcmd('PWD') # fix around non-compliant implementations such as IIS shipped # with Windows server 2003 if not resp.startswith('257'): return '' return parse257(resp) def quit(self): '''Quit, and close the connection.''' resp = self.voidcmd('QUIT') self.close() return resp def close(self): '''Close the connection without assuming anything about it.''' try: file = self.file self.file = None if file is not None: file.close() finally: sock = self.sock self.sock = None if sock is not None: sock.close() try: import ssl except ImportError: _SSLSocket = None else: _SSLSocket = ssl.SSLSocket class FTP_TLS(FTP): '''A FTP subclass which adds TLS support to FTP as described in RFC-4217. Connect as usual to port 21 implicitly securing the FTP control connection before authenticating. Securing the data connection requires user to explicitly ask for it by calling prot_p() method. Usage example: >>> from ftplib import FTP_TLS >>> ftps = FTP_TLS('ftp.python.org') >>> ftps.login() # login anonymously previously securing control channel '230 Guest login ok, access restrictions apply.' >>> ftps.prot_p() # switch to secure data connection '200 Protection level set to P' >>> ftps.retrlines('LIST') # list directory content securely total 9 drwxr-xr-x 8 root wheel 1024 Jan 3 1994 . drwxr-xr-x 8 root wheel 1024 Jan 3 1994 .. drwxr-xr-x 2 root wheel 1024 Jan 3 1994 bin drwxr-xr-x 2 root wheel 1024 Jan 3 1994 etc d-wxrwxr-x 2 ftp wheel 1024 Sep 5 13:43 incoming drwxr-xr-x 2 root wheel 1024 Nov 17 1993 lib drwxr-xr-x 6 1094 wheel 1024 Sep 13 19:07 pub drwxr-xr-x 3 root wheel 1024 Jan 3 1994 usr -rw-r--r-- 1 root root 312 Aug 1 1994 welcome.msg '226 Transfer complete.' >>> ftps.quit() '221 Goodbye.' >>> ''' ssl_version = ssl.PROTOCOL_SSLv23 def __init__(self, host='', user='', passwd='', acct='', keyfile=None, certfile=None, context=None, timeout=_GLOBAL_DEFAULT_TIMEOUT, source_address=None): if context is not None and keyfile is not None: raise ValueError("context and keyfile arguments are mutually " "exclusive") if context is not None and certfile is not None: raise ValueError("context and certfile arguments are mutually " "exclusive") self.keyfile = keyfile self.certfile = certfile if context is None: context = ssl._create_stdlib_context(self.ssl_version, certfile=certfile, keyfile=keyfile) self.context = context self._prot_p = False FTP.__init__(self, host, user, passwd, acct, timeout, source_address) def login(self, user='', passwd='', acct='', secure=True): if secure and not isinstance(self.sock, ssl.SSLSocket): self.auth() return FTP.login(self, user, passwd, acct) def auth(self): '''Set up secure control connection by using TLS/SSL.''' if isinstance(self.sock, ssl.SSLSocket): raise ValueError("Already using TLS") if self.ssl_version >= ssl.PROTOCOL_SSLv23: resp = self.voidcmd('AUTH TLS') else: resp = self.voidcmd('AUTH SSL') self.sock = self.context.wrap_socket(self.sock, server_hostname=self.host) self.file = self.sock.makefile(mode='r', encoding=self.encoding) return resp def ccc(self): '''Switch back to a clear-text control connection.''' if not isinstance(self.sock, ssl.SSLSocket): raise ValueError("not using TLS") resp = self.voidcmd('CCC') self.sock = self.sock.unwrap() return resp def prot_p(self): '''Set up secure data connection.''' # PROT defines whether or not the data channel is to be protected. # Though RFC-2228 defines four possible protection levels, # RFC-4217 only recommends two, Clear and Private. # Clear (PROT C) means that no security is to be used on the # data-channel, Private (PROT P) means that the data-channel # should be protected by TLS. # PBSZ command MUST still be issued, but must have a parameter of # '0' to indicate that no buffering is taking place and the data # connection should not be encapsulated. self.voidcmd('PBSZ 0') resp = self.voidcmd('PROT P') self._prot_p = True return resp def prot_c(self): '''Set up clear text data connection.''' resp = self.voidcmd('PROT C') self._prot_p = False return resp # --- Overridden FTP methods def ntransfercmd(self, cmd, rest=None): conn, size = FTP.ntransfercmd(self, cmd, rest) if self._prot_p: conn = self.context.wrap_socket(conn, server_hostname=self.host) return conn, size def abort(self): # overridden as we can't pass MSG_OOB flag to sendall() line = b'ABOR' + B_CRLF self.sock.sendall(line) resp = self.getmultiline() if resp[:3] not in {'426', '225', '226'}: raise error_proto(resp) return resp __all__.append('FTP_TLS') all_errors = (Error, OSError, EOFError, ssl.SSLError) _150_re = None def parse150(resp): '''Parse the '150' response for a RETR request. Returns the expected transfer size or None; size is not guaranteed to be present in the 150 message. ''' if resp[:3] != '150': raise error_reply(resp) global _150_re if _150_re is None: import re _150_re = re.compile( "150 .* \((\d+) bytes\)", re.IGNORECASE | re.ASCII) m = _150_re.match(resp) if not m: return None return int(m.group(1)) _227_re = None def parse227(resp): '''Parse the '227' response for a PASV request. Raises error_proto if it does not contain '(h1,h2,h3,h4,p1,p2)' Return ('host.addr.as.numbers', port#) tuple.''' if resp[:3] != '227': raise error_reply(resp) global _227_re if _227_re is None: import re _227_re = re.compile(r'(\d+),(\d+),(\d+),(\d+),(\d+),(\d+)', re.ASCII) m = _227_re.search(resp) if not m: raise error_proto(resp) numbers = m.groups() host = '.'.join(numbers[:4]) port = (int(numbers[4]) << 8) + int(numbers[5]) return host, port def parse229(resp, peer): '''Parse the '229' response for an EPSV request. Raises error_proto if it does not contain '(|||port|)' Return ('host.addr.as.numbers', port#) tuple.''' if resp[:3] != '229': raise error_reply(resp) left = resp.find('(') if left < 0: raise error_proto(resp) right = resp.find(')', left + 1) if right < 0: raise error_proto(resp) # should contain '(|||port|)' if resp[left + 1] != resp[right - 1]: raise error_proto(resp) parts = resp[left + 1:right].split(resp[left+1]) if len(parts) != 5: raise error_proto(resp) host = peer[0] port = int(parts[3]) return host, port def parse257(resp): '''Parse the '257' response for a MKD or PWD request. This is a response to a MKD or PWD request: a directory name. Returns the directoryname in the 257 reply.''' if resp[:3] != '257': raise error_reply(resp) if resp[3:5] != ' "': return '' # Not compliant to RFC 959, but UNIX ftpd does this dirname = '' i = 5 n = len(resp) while i < n: c = resp[i] i = i+1 if c == '"': if i >= n or resp[i] != '"': break i = i+1 dirname = dirname + c return dirname def print_line(line): '''Default retrlines callback to print a line.''' print(line) def ftpcp(source, sourcename, target, targetname = '', type = 'I'): '''Copy file from one FTP-instance to another.''' if not targetname: targetname = sourcename type = 'TYPE ' + type source.voidcmd(type) target.voidcmd(type) sourcehost, sourceport = parse227(source.sendcmd('PASV')) target.sendport(sourcehost, sourceport) # RFC 959: the user must "listen" [...] BEFORE sending the # transfer request. # So: STOR before RETR, because here the target is a "user". treply = target.sendcmd('STOR ' + targetname) if treply[:3] not in {'125', '150'}: raise error_proto # RFC 959 sreply = source.sendcmd('RETR ' + sourcename) if sreply[:3] not in {'125', '150'}: raise error_proto # RFC 959 source.voidresp() target.voidresp() class Netrc: """Class to parse & provide access to 'netrc' format files. See the netrc(4) man page for information on the file format. WARNING: This class is obsolete -- use module netrc instead. """ __defuser = None __defpasswd = None __defacct = None def __init__(self, filename=None): warnings.warn("This class is deprecated, use the netrc module instead", DeprecationWarning, 2) if filename is None: if "HOME" in os.environ: filename = os.path.join(os.environ["HOME"], ".netrc") else: raise OSError("specify file to load or set $HOME") self.__hosts = {} self.__macros = {} fp = open(filename, "r") in_macro = 0 while 1: line = fp.readline() if not line: break if in_macro and line.strip(): macro_lines.append(line) continue elif in_macro: self.__macros[macro_name] = tuple(macro_lines) in_macro = 0 words = line.split() host = user = passwd = acct = None default = 0 i = 0 while i < len(words): w1 = words[i] if i+1 < len(words): w2 = words[i + 1] else: w2 = None if w1 == 'default': default = 1 elif w1 == 'machine' and w2: host = w2.lower() i = i + 1 elif w1 == 'login' and w2: user = w2 i = i + 1 elif w1 == 'password' and w2: passwd = w2 i = i + 1 elif w1 == 'account' and w2: acct = w2 i = i + 1 elif w1 == 'macdef' and w2: macro_name = w2 macro_lines = [] in_macro = 1 break i = i + 1 if default: self.__defuser = user or self.__defuser self.__defpasswd = passwd or self.__defpasswd self.__defacct = acct or self.__defacct if host: if host in self.__hosts: ouser, opasswd, oacct = \ self.__hosts[host] user = user or ouser passwd = passwd or opasswd acct = acct or oacct self.__hosts[host] = user, passwd, acct fp.close() def get_hosts(self): """Return a list of hosts mentioned in the .netrc file.""" return self.__hosts.keys() def get_account(self, host): """Returns login information for the named host. The return value is a triple containing userid, password, and the accounting field. """ host = host.lower() user = passwd = acct = None if host in self.__hosts: user, passwd, acct = self.__hosts[host] user = user or self.__defuser passwd = passwd or self.__defpasswd acct = acct or self.__defacct return user, passwd, acct def get_macros(self): """Return a list of all defined macro names.""" return self.__macros.keys() def get_macro(self, macro): """Return a sequence of lines which define a named macro.""" return self.__macros[macro] def test(): '''Test program. Usage: ftp [-d] [-r[file]] host [-l[dir]] [-d[dir]] [-p] [file] ... -d dir -l list -p password ''' if len(sys.argv) < 2: print(test.__doc__) sys.exit(0) debugging = 0 rcfile = None while sys.argv[1] == '-d': debugging = debugging+1 del sys.argv[1] if sys.argv[1][:2] == '-r': # get name of alternate ~/.netrc file: rcfile = sys.argv[1][2:] del sys.argv[1] host = sys.argv[1] ftp = FTP(host) ftp.set_debuglevel(debugging) userid = passwd = acct = '' try: netrc = Netrc(rcfile) except OSError: if rcfile is not None: sys.stderr.write("Could not open account file" " -- using anonymous login.") else: try: userid, passwd, acct = netrc.get_account(host) except KeyError: # no account for host sys.stderr.write( "No account -- using anonymous login.") ftp.login(userid, passwd, acct) for file in sys.argv[2:]: if file[:2] == '-l': ftp.dir(file[2:]) elif file[:2] == '-d': cmd = 'CWD' if file[2:]: cmd = cmd + ' ' + file[2:] resp = ftp.sendcmd(cmd) elif file == '-p': ftp.set_pasv(not ftp.passiveserver) else: ftp.retrbinary('RETR ' + file, \ sys.stdout.write, 1024) ftp.quit() if __name__ == '__main__': test()