# Lib/string.py

# module 'string' -- A collection of string operations

# Warning: most of the code you see here isn't normally used nowadays.
# At the end of this file most functions are replaced by built-in
# functions imported from built-in module "strop".

# Character-class strings, in the spirit of the <ctype.h> classification macros
whitespace = ' \t\n\r\v\f'
lowercase = 'abcdefghijklmnopqrstuvwxyz'
uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# The remaining classes are derived from the ones above
letters = lowercase + uppercase
octdigits = digits[:8]
hexdigits = digits + lowercase[:6] + uppercase[:6]

# Case conversion helpers
# Each table is a 256-character string indexed by character code.
# _idmap is the identity table: _idmap[n] == chr(n) for n in 0..255
_idmap = ''
for i in range(256): _idmap = _idmap + chr(i)
# _lower: identity, except that the 'A'..'Z' slots hold 'a'..'z'
_lower = _idmap[:ord('A')] + lowercase + _idmap[ord('Z')+1:]
# _upper: identity, except that the 'a'..'z' slots hold 'A'..'Z'
_upper = _idmap[:ord('a')] + uppercase + _idmap[ord('z')+1:]
# _swapcase: starts from _upper and also maps 'A'..'Z' to 'a'..'z'
_swapcase = _upper[:ord('A')] + lowercase + _upper[ord('Z')+1:]
# Remove the loop index so it does not linger as a module attribute
del i

# Backward compatible names for exceptions:
# each of them is just another name for the built-in ValueError
index_error = atoi_error = atof_error = atol_error = ValueError

# Convert UPPER CASE letters to lower case.
# Every character is looked up by its code in the _lower table, so
# characters other than 'A'..'Z' come back unchanged.
def lower(s):
	table = _lower
	out = ''
	for code in map(ord, s):
		out = out + table[code]
	return out

# Convert lower case letters to UPPER CASE.
# Every character is looked up by its code in the _upper table, so
# characters other than 'a'..'z' come back unchanged.
def upper(s):
	table = _upper
	out = ''
	for code in map(ord, s):
		out = out + table[code]
	return out

# Swap lower case letters and UPPER CASE.
# Every character is looked up by its code in the _swapcase table, so
# non-letters come back unchanged.
def swapcase(s):
	table = _swapcase
	out = ''
	for code in map(ord, s):
		out = out + table[code]
	return out

# Strip leading and trailing whitespace.
# "Whitespace" is any character in the module-level string 'whitespace'
# (not only tabs and spaces).
def strip(s):
	start, stop = 0, len(s)
	while start < stop:
		if s[start] not in whitespace: break
		start = start + 1
	while stop > start:
		if s[stop-1] not in whitespace: break
		stop = stop - 1
	return s[start:stop]

# Strip leading whitespace.
# "Whitespace" is any character in the module-level string 'whitespace'
# (not only tabs and spaces).
def lstrip(s):
	start, stop = 0, len(s)
	while start < stop:
		if s[start] not in whitespace: break
		start = start + 1
	return s[start:stop]

# Strip trailing whitespace.
# "Whitespace" is any character in the module-level string 'whitespace'
# (not only tabs and spaces).
def rstrip(s):
	stop = len(s)
	while stop > 0:
		if s[stop-1] not in whitespace: break
		stop = stop - 1
	return s[0:stop]


# Split a string into a list of whitespace-separated words
# NB: split(s) is NOT the same as splitfields(s, ' ')!
# If sep is given, the work is delegated to splitfields().
# maxsplit limits the number of splits; zero or a negative value means
# "no limit".  Once the limit is reached, the rest of the string (minus
# its leading whitespace) is returned as the final word, so the result
# has at most maxsplit+1 items.
def split(s, sep=None, maxsplit=0):
	if sep is not None: return splitfields(s, sep, maxsplit)
	res = []
	i, n = 0, len(s)
	# A string of n characters can never yield n splits, so n acts as
	# "unlimited"
	if maxsplit <= 0: maxsplit = n
	count = 0
	while i < n:
		while i < n and s[i] in whitespace: i = i+1
		if i == n: break
		if count >= maxsplit:
			# Split budget used up: keep the remainder verbatim
			res.append(s[i:])
			break
		count = count + 1
		j = i
		while j < n and s[j] not in whitespace: j = j+1
		res.append(s[i:j])
		i = j
	return res

# Split a string into fields separated by a given string
# NB: splitfields(s, ' ') is NOT the same as split(s)!
# splitfields(s, '') returns [s] (in analogy with split() in nawk)
# With sep omitted (None) the work is delegated to split().
# A non-zero maxsplit stops the scan after that many separators have
# been consumed; the unsplit remainder becomes the last field.
def splitfields(s, sep=None, maxsplit=0):
	if sep is None:
		return split(s, None, maxsplit)
	width = len(sep)
	if width == 0:
		return [s]
	fields = []
	# Last offset at which a complete separator still fits
	limit = len(s) - width
	start = pos = 0
	nsplits = 0
	while pos <= limit:
		if s[pos:pos+width] != sep:
			pos = pos + 1
			continue
		fields.append(s[start:pos])
		start = pos = pos + width
		nsplits = nsplits + 1
		if maxsplit and nsplits >= maxsplit:
			break
	# Whatever follows the last separator (possibly '') is the final field
	fields.append(s[start:])
	return fields

# Join words with spaces between them
# Thin wrapper: both arguments are handed to joinfields() unchanged and
# its result is returned, so sep may be any separator string.
def join(words, sep = ' '):
	return joinfields(words, sep)

# Join fields with optional separator (a single space by default).
# An empty sequence gives the empty string.
def joinfields(words, sep = ' '):
	joined = ''
	for word in words:
		# Prefix every word, the first one included, with the separator ...
		joined = joined + (sep + word)
	# ... then drop the one separator too many at the front
	return joined[len(sep):]

# Find substring, raise exception if not found
# Same arguments and result as find(), except that a missing substring
# raises ValueError instead of returning -1.
def index(s, sub, i = 0, last=None):
	# find is looked up at call time, so it may be the strop version
	# imported at the end of this file; presumably that one needs an
	# integer 'last' -- TODO confirm.  Resolve the default here.
	if last is None: last = len(s)
	res = find(s, sub, i, last)
	if res < 0:
		raise ValueError('substring not found in string.index')
	return res

# Find last substring, raise exception if not found
# Same arguments and result as rfind(), except that a missing substring
# raises ValueError instead of returning -1.
def rindex(s, sub, i = 0, last=None):
	# rfind is looked up at call time, so it may be the strop version
	# imported at the end of this file; presumably that one needs an
	# integer 'last' -- TODO confirm.  Resolve the default here.
	if last is None: last = len(s)
	res = rfind(s, sub, i, last)
	if res < 0:
		raise ValueError('substring not found in string.rindex')
	return res

# Count non-overlapping occurrences of substring
# The search starts at offset i; a negative i counts from the end of s
# and is clamped to the start of the string.
# An empty sub matches once at every position from i up to and
# including len(s); the result is never negative.
def count(s, sub, i = 0):
	if i < 0: i = max(0, i + len(s))
	n = len(sub)
	# m is one past the last offset at which sub could still fit
	m = len(s) + 1 - n
	if n == 0:
		# Clamp: a start offset beyond the end must give 0, not a
		# negative count
		return max(0, m - i)
	r = 0
	while i < m:
		if sub == s[i:i+n]:
			r = r+1
			# Skip the whole match so occurrences cannot overlap
			i = i+n
		else:
			i = i+1
	return r

# Find substring, return -1 if not found
# Only matches lying entirely inside s[i:last] are considered; the
# result is the lowest matching offset.  Negative i or last count from
# the end of s, and last is clamped to len(s).
def find(s, sub, i = 0, last=None):
	size = len(s)
	if last is None or last > size:
		last = size
	elif last < 0:
		last = max(0, last + size)
	if i < 0:
		i = max(0, i + size)
	width = len(sub)
	# Highest offset at which sub still ends at or before 'last'
	stop = last - width
	while i <= stop:
		if s[i:i+width] == sub:
			return i
		i = i + 1
	return -1

# Find last substring, return -1 if not found
# Only matches lying entirely inside s[i:last] are considered; the
# result is the highest matching offset.  Negative i or last count from
# the end of s, and last is clamped to len(s).
def rfind(s, sub, i = 0, last=None):
	size = len(s)
	if last is None or last > size:
		last = size
	elif last < 0:
		last = max(0, last + size)
	if i < 0:
		i = max(0, i + size)
	width = len(sub)
	# Walk the candidate offsets from the right: the first hit is the
	# rightmost match
	pos = last - width
	while pos >= i:
		if s[pos:pos+width] == sub:
			return pos
		pos = pos - 1
	return -1

# Convert string to float
# Accepted form: an optional sign, then digits with an optional
# fractional part and an optional exponent, e.g. '12', '-1.5', '.5',
# '1.', '+2e-3'.  Surrounding whitespace is not accepted.
# Raises ValueError for anything else, including the empty string.
def atof(str):
	import re
	sign = ''
	s = str
	if s and s[0] in '+-':
		sign = s[0]
		s = s[1:]
	if not s:
		raise ValueError('non-float argument to string.atof')
	# The pattern can match a mere prefix (or nothing at all), so insist
	# that the match covers the whole string
	m = re.match(r'[0-9]*(\.[0-9]*)?([eE][-+]?[0-9]+)?', s)
	if m is None or m.end() != len(s):
		raise ValueError('non-float argument to string.atof')
	try:
		# float() rather than eval(): digit-less strings such as 'e5' or
		# '.' pass the pattern, and eval() would treat 'e5' as a
		# variable name
		return float(sign + s)
	except ValueError:
		raise ValueError('non-float argument to string.atof')

# Convert string to integer
# Accepted form: an optional sign followed by one or more decimal
# digits; leading zeros are ignored.  Only base 10 is supported here.
# Raises ValueError for any other input or base.
def atoi(str, base=10):
	if base != 10:
		# We only get here if strop doesn't define atoi()
		raise ValueError("this string.atoi doesn't support base != 10")
	sign = ''
	s = str
	if s and s[0] in '+-':
		sign = s[0]
		s = s[1:]
	if not s:
		raise ValueError('non-integer argument to string.atoi')
	# Drop leading zeros so the literal below is not read as octal
	while s[0] == '0' and len(s) > 1: s = s[1:]
	for c in s:
		if c not in digits:
			raise ValueError('non-integer argument to string.atoi')
	# NOTE: eval() is only reached with an optional sign followed by
	# characters that passed the digit check above
	return eval(sign + s)

# Convert string to long integer
# Accepted form: an optional sign followed by one or more decimal
# digits; leading zeros are ignored.  Only base 10 is supported here.
# Raises ValueError for any other input or base.
def atol(str, base=10):
	if base != 10:
		# We only get here if strop doesn't define atol()
		raise ValueError("this string.atol doesn't support base != 10")
	sign = ''
	s = str
	if s and s[0] in '+-':
		sign = s[0]
		s = s[1:]
	if not s:
		raise ValueError('non-integer argument to string.atol')
	# Drop leading zeros so the literal below is not read as octal
	while s[0] == '0' and len(s) > 1: s = s[1:]
	for c in s:
		if c not in digits:
			raise ValueError('non-integer argument to string.atol')
	# NOTE: eval() is only reached with an optional sign followed by
	# characters that passed the digit check above; the 'L' suffix
	# turns the text into a long integer literal
	return eval(sign + s + 'L')

# Left-justify a string: pad it with spaces on the right up to 'width'.
# A string that is already at least that long is returned unchanged
# (it is never truncated).
def ljust(s, width):
	pad = width - len(s)
	if pad > 0:
		return s + ' '*pad
	return s

# Right-justify a string: pad it with spaces on the left up to 'width'.
# A string that is already at least that long is returned unchanged
# (it is never truncated).
def rjust(s, width):
	pad = width - len(s)
	if pad > 0:
		return ' '*pad + s
	return s

# Center a string: pad it with spaces on both sides up to 'width'.
# A string that is already at least that long is returned unchanged.
# When the padding cannot be split evenly, the extra space goes on the
# left if width is odd and on the right if width is even.
def center(s, width):
	n = width - len(s)
	if n <= 0: return s
	# divmod() keeps 'half' an integer however the '/' operator behaves
	half, odd = divmod(n, 2)
	if odd and width%2:
		# This ensures that center(center(s, i), j) = center(s, j)
		half = half+1
	return ' '*half +  s + ' '*(n-half)

# Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
# Decadent feature: the argument may be a string or a number
# (Use of this is deprecated; it should be a string as with ljust c.s.)
# A non-string argument is converted with repr().  The zeros go between
# a leading sign, if any, and the rest of the text.  Text that is
# already at least 'width' long is returned unchanged.
def zfill(x, width):
	if type(x) == type(''): s = x
	else: s = repr(x)
	n = len(s)
	if n >= width: return s
	sign = ''
	# s[:1] rather than s[0]: the empty string has no first character
	if s[:1] in ('-', '+'):
		sign, s = s[0], s[1:]
	return sign + '0'*(width-n) + s

# Expand tabs in a string.
# Doesn't take non-printing chars into account, but does understand \n.
# Each tab is replaced by the spaces needed to reach the next multiple
# of tabsize, counted from the start of the current line.
def expandtabs(s, tabsize=8):
	done = ''
	current = ''
	for ch in s:
		if ch == '\n':
			# Line finished: bank it and restart the column count
			done = done + current + ch
			current = ''
		elif ch == '\t':
			current = current + ' '*(tabsize - len(current)%tabsize)
		else:
			current = current + ch
	return done + current

# Character translation through look-up table.
# Every character of s that does not occur in 'deletions' is replaced
# by table[ord(character)]; characters in 'deletions' are dropped.
# Raises TypeError unless table is a string of exactly 256 characters.
def translate(s, table, deletions=""):
	if type(table) != type('') or len(table) != 256:
		raise TypeError("translation table must be 256 characters long")
	res = ""
	for c in s:
		if c not in deletions:
			res = res + table[ord(c)]
	return res

# Capitalize a string, e.g. "aBc  dEf" -> "Abc  def".
# The first character goes through upper(), everything after it through
# lower(); the empty string stays empty.
def capitalize(s):
	head, tail = s[:1], s[1:]
	return upper(head) + lower(tail)

# Capitalize the words in a string, e.g. " aBc  dEf " -> "Abc Def".
# See also regsub.capwords().
# The string is cut up with split(s, sep), every piece is capitalized,
# and the pieces are glued together again with sep (a single space when
# sep is None or empty).
def capwords(s, sep=None):
	words = map(capitalize, split(s, sep))
	return join(words, sep or ' ')

# Construct a translation string
# The result is a 256-character table, usable with translate(), that
# maps fromstr[k] to tostr[k] and every other character to itself.
# Raises ValueError if fromstr and tostr differ in length.
# _idmapL caches the identity table as a list of characters; it is
# built on the first call.
_idmapL = None
def maketrans(fromstr, tostr):
	global _idmapL
	if len(fromstr) != len(tostr):
		raise ValueError("maketrans arguments must have same length")
	if not _idmapL:
		chars = []
		for c in _idmap:
			chars.append(c)
		_idmapL = chars
	# Work on a copy so the cached identity list stays pristine
	L = _idmapL[:]
	for i in range(len(fromstr)):
		L[ord(fromstr[i])] = tostr[i]
	return joinfields(L, "")

# Try importing optional built-in module "strop" -- if it exists,
# it redefines some string operations that are 100-1000 times faster.
# It also defines values for whitespace, lowercase and uppercase
# that match <ctype.h>'s definitions.

try:
	# Deliberate star-import: every public name that strop provides
	# replaces the definition of the same name made earlier in this file
	from strop import *
	# lowercase and uppercase may just have been rebound, so derive
	# letters again to keep it in step with them
	letters = lowercase + uppercase
except ImportError:
	pass # Use the original, slow versions