python字符串操作和string模块代码分析 牛人总结 转存
原文链接: http://blog.chinaunix.net/uid-25992400-id-3283846.html
    任何编程语言都离不开字符串,也就必然涉及对字符串的操作,脚本语言尤其频繁;不管是生产环境还是面试,都要面对字符串操作。
 可以将这些方法按功能用途划分为以下几种类型:
 
 
 
 
    python的字符串操作需求,基本上都可以通过以下2部分的方法和函数来解决:
- python的字符串属性函数
- python的string模块
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- 字符串属性函数
     系统版本:CentOS release 6.2 (Final),内核版本:2.6.32-220.el6.x86_64
     python版本:Python 2.6.6
字符串属性方法
- >>> str='string learn'
- >>> dir(str)
- ['__add__', '__class__', '__contains__', '__delattr__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getitem__', '__getnewargs__', '__getslice__', '__gt__', '__hash__', '__init__', '__le__', '__len__', '__lt__', '__mod__', '__mul__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__rmod__', '__rmul__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_formatter_field_name_split', '_formatter_parser', 'capitalize', 'center', 'count', 'decode', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'index', 'isalnum', 'isalpha', 'isdigit', 'islower', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower','lstrip', 'partition', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip','split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']
字符串格式输出对齐
- >>> str='stRINg lEArn'
- >>>
- >>> str.center(20) #生成20个字符长度,str排中间
- '    stRINg lEArn    '
- >>>
- >>> str.ljust(20) #str左对齐
- 'stRINg lEArn        '
- >>>
- >>> str.rjust(20) #str右对齐
- '        stRINg lEArn'
- >>>
- >>> str.zfill(20) #str右对齐,左边填充0
- '00000000stRINg lEArn'
大小写转换
- >>> str='stRINg lEArn'
- >>>
- >>> str.upper() #转大写
- 'STRING LEARN'
- >>>
- >>> str.lower() #转小写
- 'string learn'
- >>>
- >>> str.capitalize() #字符串首字母大写,其余小写
- 'String learn'
- >>>
- >>> str.swapcase() #大小写对换
- 'STrinG LeaRN'
- >>>
- >>> str.title() #以分隔符为界,每个单词首字母大写,其余小写
- 'String Learn'
字符串条件判断
- >>> str='0123'
- >>> str.isalnum() #是否全是字母和数字,并至少有一个字符
- True
- >>> str.isdigit() #是否全是数字,并至少有一个字符
- True
- >>> str='abcd'
- >>> str.isalnum()
- True
- >>> str.isalpha() #是否全是字母,并至少有一个字符
- True
- >>> str.islower() #是否全是小写,当全是小写和数字一起时候,也判断为True
- True
- >>> str='abcd0123'
- >>> str.islower() #同上
- True
- >>> str.isalnum()
- True
- >>> str=' '
- >>> str.isspace() #是否全是空白字符,并至少有一个字符
- True
- >>> str='ABC'
- >>> str.isupper() #是否全是大写,当全是大写和数字一起时候,也判断为True
- True
- >>> str='Abb Acc'
- >>> str.istitle() #是否每个单词都是首字母大写、其余小写(标题格式)
- True
- >>> str='string learn'
- >>> str.startswith('str') #判断字符串以'str'开头
- True
- >>> str.endswith('arn') #判断字符串以'arn'结尾
- True
字符串搜索定位与替换
- >>> str='string lEARn'
- >>>
- >>> str.find('a') #查找子串,没有则返回-1,有则返回第一个匹配位置的索引
- -1
- >>> str.find('n')
- 4
- >>> str.rfind('n') #同上,只是返回的索引是最后一次匹配的
- 11
- >>>
- >>> str.index('a') #如果没有匹配则报错
- Traceback (most recent call last):
- File "<stdin>", line 1, in <module>
- ValueError: substring not found
- >>> str.index('n') #同find类似,返回第一次匹配的索引值
- 4
- >>> str.rindex('n') #返回最后一次匹配的索引值
- 11
- >>>
- >>> str.count('a') #字符串中匹配的次数
- 0
- >>> str.count('n') #同上
- 2
- >>>
- >>> str.replace('EAR','ear') #匹配替换
- 'string learn'
- >>> str.replace('n','N')
- 'striNg lEARN'
- >>> str.replace('n','N',1)
- 'striNg lEARn'
- >>>
- >>>
- >>> str.strip('n') #删除字符串首尾匹配的字符;不带参数时默认删除首尾空白字符(含换行符)
- 'string lEAR'
- >>> str.lstrip('n') #左匹配
- 'string lEARn'
- >>> str.rstrip('n') #右匹配
- 'string lEAR'
- >>>
- >>> str='\ttab'
- >>> str.expandtabs() #把制表符转为空格,默认制表位宽度为8
- '        tab'
- >>> str.expandtabs(2) #指定制表位宽度
- '  tab'
字符串编码与解码
- >>> str='字符串学习'
- >>> str
- '\xe5\xad\x97\xe7\xac\xa6\xe4\xb8\xb2\xe5\xad\xa6\xe4\xb9\xa0'
- >>>
- >>> str.decode('utf-8') #解码过程,将utf-8解码为unicode
- u'\u5b57\u7b26\u4e32\u5b66\u4e60'
- >>> str.decode('utf-8').encode('gbk') #编码过程,将unicode编码为gbk
- '\xd7\xd6\xb7\xfb\xb4\xae\xd1\xa7\xcf\xb0'
- >>> str.decode('utf-8').encode('utf-8') #将unicode编码为utf-8
- '\xe5\xad\x97\xe7\xac\xa6\xe4\xb8\xb2\xe5\xad\xa6\xe4\xb9\xa0'
字符串分割变换
- >>> str='Learn string'
- >>> '-'.join(str)
- 'L-e-a-r-n- -s-t-r-i-n-g'
- >>> l1=['Learn','string']
- >>> '-'.join(l1)
- 'Learn-string'
- >>>
- >>> str.split('n')
- ['Lear', ' stri', 'g']
- >>> str.split('n',1)
- ['Lear', ' string']
- >>> str.rsplit('n',1)
- ['Learn stri', 'g']
- >>>
- >>> str.splitlines()
- ['Learn string']
- >>>
- >>> str.partition('n')
- ('Lear', 'n', ' string')
- >>> str.rpartition('n')
- ('Learn stri', 'n', 'g')
string模块源代码
- """A collection of string operations (most are no longer used).
- Warning: most of the code you see here isn't normally used nowadays.
- Beginning with Python 1.6, many of these functions are implemented as
- methods on the standard string object. They used to be implemented by
- a built-in module called strop, but strop is now obsolete itself.
- Public module variables:
- whitespace -- a string containing all characters considered whitespace
- lowercase -- a string containing all characters considered lowercase letters
- uppercase -- a string containing all characters considered uppercase letters
- letters -- a string containing all characters considered letters
- digits -- a string containing all characters considered decimal digits
- hexdigits -- a string containing all characters considered hexadecimal digits
- octdigits -- a string containing all characters considered octal digits
- punctuation -- a string containing all characters considered punctuation
- printable -- a string containing all characters considered printable
- """
- # Some strings for ctype-style character classification
- whitespace = ' \t\n\r\v\f'
- lowercase = 'abcdefghijklmnopqrstuvwxyz'
- uppercase = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
- letters = lowercase + uppercase
- ascii_lowercase = lowercase
- ascii_uppercase = uppercase
- ascii_letters = ascii_lowercase + ascii_uppercase
- digits = '0123456789'
- hexdigits = digits + 'abcdef' + 'ABCDEF'
- octdigits = '01234567'
- punctuation = """!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
- printable = digits + letters + punctuation + whitespace
- # Case conversion helpers
- # Use str to convert Unicode literal in case of -U
- l = map(chr, xrange(256))
- _idmap = str('').join(l)
- del l
- # Functions which aren't available as string methods.
- # Capitalize the words in a string, e.g. " aBc dEf " -> "Abc Def".
- def capwords(s, sep=None):
- """capwords(s [,sep]) -> string
- Split the argument into words using split, capitalize each
- word using capitalize, and join the capitalized words using
- join. If the optional second argument sep is absent or None,
- runs of whitespace characters are replaced by a single space
- and leading and trailing whitespace are removed, otherwise
- sep is used to split and join the words.
- """
- return (sep or ' ').join(x.capitalize() for x in s.split(sep))
- # Construct a translation string
- _idmapL = None
- def maketrans(fromstr, tostr):
- """maketrans(frm, to) -> string
- Return a translation table (a string of 256 bytes long)
- suitable for use in string.translate. The strings frm and to
- must be of the same length.
- """
- if len(fromstr) != len(tostr):
- raise ValueError, "maketrans arguments must have same length"
- global _idmapL
- if not _idmapL:
- _idmapL = list(_idmap)
- L = _idmapL[:]
- fromstr = map(ord, fromstr)
- for i in range(len(fromstr)):
- L[fromstr[i]] = tostr[i]
- return ''.join(L)
- ####################################################################
- import re as _re
- class _multimap:
- """Helper class for combining multiple mappings.
- Used by .{safe_,}substitute() to combine the mapping and keyword
- arguments.
- """
- def __init__(self, primary, secondary):
- self._primary = primary
- self._secondary = secondary
- def __getitem__(self, key):
- try:
- return self._primary[key]
- except KeyError:
- return self._secondary[key]
- class _TemplateMetaclass(type):
- pattern = r"""
- %(delim)s(?:
- (?P<escaped>%(delim)s) | # Escape sequence of two delimiters
- (?P<named>%(id)s) | # delimiter and a Python identifier
- {(?P<braced>%(id)s)} | # delimiter and a braced identifier
- (?P<invalid>) # Other ill-formed delimiter exprs
- )
- """
- def __init__(cls, name, bases, dct):
- super(_TemplateMetaclass, cls).__init__(name, bases, dct)
- if 'pattern' in dct:
- pattern = cls.pattern
- else:
- pattern = _TemplateMetaclass.pattern % {
- 'delim' : _re.escape(cls.delimiter),
- 'id' : cls.idpattern,
- }
- cls.pattern = _re.compile(pattern, _re.IGNORECASE | _re.VERBOSE)
- class Template:
- """A string class for supporting $-substitutions."""
- __metaclass__ = _TemplateMetaclass
- delimiter = '$'
- idpattern = r'[_a-z][_a-z0-9]*'
- def __init__(self, template):
- self.template = template
- # Search for $$, $identifier, ${identifier}, and any bare $'s
- def _invalid(self, mo):
- i = mo.start('invalid')
- lines = self.template[:i].splitlines(True)
- if not lines:
- colno = 1
- lineno = 1
- else:
- colno = i - len(''.join(lines[:-1]))
- lineno = len(lines)
- raise ValueError('Invalid placeholder in string: line %d, col %d' %
- (lineno, colno))
- def substitute(self, *args, **kws):
- if len(args) > 1:
- raise TypeError('Too many positional arguments')
- if not args:
- mapping = kws
- elif kws:
- mapping = _multimap(kws, args[0])
- else:
- mapping = args[0]
- # Helper function for .sub()
- def convert(mo):
- # Check the most common path first.
- named = mo.group('named') or mo.group('braced')
- if named is not None:
- val = mapping[named]
- # We use this idiom instead of str() because the latter will
- # fail if val is a Unicode containing non-ASCII characters.
- return '%s' % (val,)
- if mo.group('escaped') is not None:
- return self.delimiter
- if mo.group('invalid') is not None:
- self._invalid(mo)
- raise ValueError('Unrecognized named group in pattern',
- self.pattern)
- return self.pattern.sub(convert, self.template)
- def safe_substitute(self, *args, **kws):
- if len(args) > 1:
- raise TypeError('Too many positional arguments')
- if not args:
- mapping = kws
- elif kws:
- mapping = _multimap(kws, args[0])
- else:
- mapping = args[0]
- # Helper function for .sub()
- def convert(mo):
- named = mo.group('named')
- if named is not None:
- try:
- # We use this idiom instead of str() because the latter
- # will fail if val is a Unicode containing non-ASCII
- return '%s' % (mapping[named],)
- except KeyError:
- return self.delimiter + named
- braced = mo.group('braced')
- if braced is not None:
- try:
- return '%s' % (mapping[braced],)
- except KeyError:
- return self.delimiter + '{' + braced + '}'
- if mo.group('escaped') is not None:
- return self.delimiter
- if mo.group('invalid') is not None:
- return self.delimiter
- raise ValueError('Unrecognized named group in pattern',
- self.pattern)
- return self.pattern.sub(convert, self.template)
- ####################################################################
- # NOTE: Everything below here is deprecated. Use string methods instead.
- # This stuff will go away in Python 3.0.
- # Backward compatible names for exceptions
- index_error = ValueError
- atoi_error = ValueError
- atof_error = ValueError
- atol_error = ValueError
- # convert UPPER CASE letters to lower case
- def lower(s):
- """lower(s) -> string
- Return a copy of the string s converted to lowercase.
- """
- return s.lower()
- # Convert lower case letters to UPPER CASE
- def upper(s):
- """upper(s) -> string
- Return a copy of the string s converted to uppercase.
- """
- return s.upper()
- # Swap lower case letters and UPPER CASE
- def swapcase(s):
- """swapcase(s) -> string
- Return a copy of the string s with upper case characters
- converted to lowercase and vice versa.
- """
- return s.swapcase()
- # Strip leading and trailing tabs and spaces
- def strip(s, chars=None):
- """strip(s [,chars]) -> string
- Return a copy of the string s with leading and trailing
- whitespace removed.
- If chars is given and not None, remove characters in chars instead.
- If chars is unicode, S will be converted to unicode before stripping.
- """
- return s.strip(chars)
- # Strip leading tabs and spaces
- def lstrip(s, chars=None):
- """lstrip(s [,chars]) -> string
- Return a copy of the string s with leading whitespace removed.
- If chars is given and not None, remove characters in chars instead.
- """
- return s.lstrip(chars)
- # Strip trailing tabs and spaces
- def rstrip(s, chars=None):
- """rstrip(s [,chars]) -> string
- Return a copy of the string s with trailing whitespace removed.
- If chars is given and not None, remove characters in chars instead.
- """
- return s.rstrip(chars)
- # Split a string into a list of space/tab-separated words
- def split(s, sep=None, maxsplit=-1):
- """split(s [,sep [,maxsplit]]) -> list of strings
- Return a list of the words in the string s, using sep as the
- delimiter string. If maxsplit is given, splits at no more than
- maxsplit places (resulting in at most maxsplit+1 words). If sep
- is not specified or is None, any whitespace string is a separator.
- (split and splitfields are synonymous)
- """
- return s.split(sep, maxsplit)
- splitfields = split
- # Split a string into a list of space/tab-separated words
- def rsplit(s, sep=None, maxsplit=-1):
- """rsplit(s [,sep [,maxsplit]]) -> list of strings
- Return a list of the words in the string s, using sep as the
- delimiter string, starting at the end of the string and working
- to the front. If maxsplit is given, at most maxsplit splits are
- done. If sep is not specified or is None, any whitespace string
- is a separator.
- """
- return s.rsplit(sep, maxsplit)
- # Join fields with optional separator
- def join(words, sep = ' '):
- """join(list [,sep]) -> string
- Return a string composed of the words in list, with
- intervening occurrences of sep. The default separator is a
- single space.
- (joinfields and join are synonymous)
- """
- return sep.join(words)
- joinfields = join
- # Find substring, raise exception if not found
- def index(s, *args):
- """index(s, sub [,start [,end]]) -> int
- Like find but raises ValueError when the substring is not found.
- """
- return s.index(*args)
- # Find last substring, raise exception if not found
- def rindex(s, *args):
- """rindex(s, sub [,start [,end]]) -> int
- Like rfind but raises ValueError when the substring is not found.
- """
- return s.rindex(*args)
- # Count non-overlapping occurrences of substring
- def count(s, *args):
- """count(s, sub[, start[,end]]) -> int
- Return the number of occurrences of substring sub in string
- s[start:end]. Optional arguments start and end are
- interpreted as in slice notation.
- """
- return s.count(*args)
- # Find substring, return -1 if not found
- def find(s, *args):
- """find(s, sub [,start [,end]]) -> in
- Return the lowest index in s where substring sub is found,
- such that sub is contained within s[start,end]. Optional
- arguments start and end are interpreted as in slice notation.
- Return -1 on failure.
- """
- return s.find(*args)
- # Find last substring, return -1 if not found
- def rfind(s, *args):
- """rfind(s, sub [,start [,end]]) -> int
- Return the highest index in s where substring sub is found,
- such that sub is contained within s[start,end]. Optional
- arguments start and end are interpreted as in slice notation.
- Return -1 on failure.
- """
- return s.rfind(*args)
- # for a bit of speed
- _float = float
- _int = int
- _long = long
- # Convert string to float
- def atof(s):
- """atof(s) -> float
- Return the floating point number represented by the string s.
- """
- return _float(s)
- # Convert string to integer
- def atoi(s , base=10):
- """atoi(s [,base]) -> int
- Return the integer represented by the string s in the given
- base, which defaults to 10. The string s must consist of one
- or more digits, possibly preceded by a sign. If base is 0, it
- is chosen from the leading characters of s, 0 for octal, 0x or
- 0X for hexadecimal. If base is 16, a preceding 0x or 0X is
- accepted.
- """
- return _int(s, base)
- # Convert string to long integer
- def atol(s, base=10):
- """atol(s [,base]) -> long
- Return the long integer represented by the string s in the
- given base, which defaults to 10. The string s must consist
- of one or more digits, possibly preceded by a sign. If base
- is 0, it is chosen from the leading characters of s, 0 for
- octal, 0x or 0X for hexadecimal. If base is 16, a preceding
- 0x or 0X is accepted. A trailing L or l is not accepted,
- unless base is 0.
- """
- return _long(s, base)
- # Left-justify a string
- def ljust(s, width, *args):
- """ljust(s, width[, fillchar]) -> string
- Return a left-justified version of s, in a field of the
- specified width, padded with spaces as needed. The string is
- never truncated. If specified the fillchar is used instead of spaces.
- """
- return s.ljust(width, *args)
- # Right-justify a string
- def rjust(s, width, *args):
- """rjust(s, width[, fillchar]) -> string
- Return a right-justified version of s, in a field of the
- specified width, padded with spaces as needed. The string is
- never truncated. If specified the fillchar is used instead of spaces.
- """
- return s.rjust(width, *args)
- # Center a string
- def center(s, width, *args):
- """center(s, width[, fillchar]) -> string
- Return a center version of s, in a field of the specified
- width. padded with spaces as needed. The string is never
- truncated. If specified the fillchar is used instead of spaces.
- """
- return s.center(width, *args)
- # Zero-fill a number, e.g., (12, 3) --> '012' and (-3, 3) --> '-03'
- # Decadent feature: the argument may be a string or a number
- # (Use of this is deprecated; it should be a string as with ljust c.s.)
- def zfill(x, width):
- """zfill(x, width) -> string
- Pad a numeric string x with zeros on the left, to fill a field
- of the specified width. The string x is never truncated.
- """
- if not isinstance(x, basestring):
- x = repr(x)
- return x.zfill(width)
- # Expand tabs in a string.
- # Doesn't take non-printing chars into account, but does understand \n.
- def expandtabs(s, tabsize=8):
- """expandtabs(s [,tabsize]) -> string
- Return a copy of the string s with all tab characters replaced
- by the appropriate number of spaces, depending on the current
- column, and the tabsize (default 8).
- """
- return s.expandtabs(tabsize)
- # Character translation through look-up table.
- def translate(s, table, deletions=""):
- """translate(s,table [,deletions]) -> string
- Return a copy of the string s, where all characters occurring
- in the optional argument deletions are removed, and the
- remaining characters have been mapped through the given
- translation table, which must be a string of length 256. The
- deletions argument is not allowed for Unicode strings.
- """
- if deletions or table is None:
- return s.translate(table, deletions)
- else:
- # Add s[:0] so that if s is Unicode and table is an 8-bit string,
- # table is converted to Unicode. This means that table *cannot*
- # be a dictionary -- for that feature, use u.translate() directly.
- return s.translate(table + s[:0])
- # Capitalize a string, e.g. "aBc dEf" -> "Abc def".
- def capitalize(s):
- """capitalize(s) -> string
- Return a copy of the string s with only its first character
- capitalized.
- """
- return s.capitalize()
- # Substring replacement (global)
- def replace(s, old, new, maxsplit=-1):
- """replace (str, old, new[, maxsplit]) -> string
- Return a copy of string str with all occurrences of substring
- old replaced by new. If the optional argument maxsplit is
- given, only the first maxsplit occurrences are replaced.
- """
- return s.replace(old, new, maxsplit)
- # Try importing optional built-in module "strop" -- if it exists,
- # it redefines some string operations that are 100-1000 times faster.
- # It also defines values for whitespace, lowercase and uppercase
- # that match <ctype.h>'s definitions.
- try:
- from strop import maketrans, lowercase, uppercase, whitespace
- letters = lowercase + uppercase
- except ImportError:
- pass # Use the original versions
- ########################################################################
- # the Formatter class
- # see PEP 3101 for details and purpose of this class
- # The hard parts are reused from the C implementation. They're exposed as "_"
- # prefixed methods of str and unicode.
- # The overall parser is implemented in str._formatter_parser.
- # The field name parser is implemented in str._formatter_field_name_split
- class Formatter(object):
- def format(self, format_string, *args, **kwargs):
- return self.vformat(format_string, args, kwargs)
- def vformat(self, format_string, args, kwargs):
- used_args = set()
- result = self._vformat(format_string, args, kwargs, used_args, 2)
- self.check_unused_args(used_args, args, kwargs)
- return result
- def _vformat(self, format_string, args, kwargs, used_args, recursion_depth):
- if recursion_depth < 0:
- raise ValueError('Max string recursion exceeded')
- result = []
- for literal_text, field_name, format_spec, conversion in \
- self.parse(format_string):
- # output the literal text
- if literal_text:
- result.append(literal_text)
- # if there's a field, output it
- if field_name is not None:
- # this is some markup, find the object and do
- # the formatting
- # given the field_name, find the object it references
- # and the argument it came from
- obj, arg_used = self.get_field(field_name, args, kwargs)
- used_args.add(arg_used)
- # do any conversion on the resulting object
- obj = self.convert_field(obj, conversion)
- # expand the format spec, if needed
- format_spec = self._vformat(format_spec, args, kwargs,
- used_args, recursion_depth-1)
- # format the object and append to the result
- result.append(self.format_field(obj, format_spec))
- return ''.join(result)
- def get_value(self, key, args, kwargs):
- if isinstance(key, (int, long)):
- return args[key]
- else:
- return kwargs[key]
- def check_unused_args(self, used_args, args, kwargs):
- pass
- def format_field(self, value, format_spec):
- return format(value, format_spec)
- def convert_field(self, value, conversion):
- # do any conversion on the resulting object
- if conversion == 'r':
- return repr(value)
- elif conversion == 's':
- return str(value)
- elif conversion is None:
- return value
- raise ValueError("Unknown converion specifier {0!s}".format(conversion))
- # returns an iterable that contains tuples of the form:
- # (literal_text, field_name, format_spec, conversion)
- # literal_text can be zero length
- # field_name can be None, in which case there's no
- # object to format and output
- # if field_name is not None, it is looked up, formatted
- # with format_spec and conversion and then used
- def parse(self, format_string):
- return format_string._formatter_parser()
- # given a field_name, find the object it references.
- # field_name: the field being looked up, e.g. "0.name"
- # or "lookup[3]"
- # used_args: a set of which args have been used
- # args, kwargs: as passed in to vformat
- def get_field(self, field_name, args, kwargs):
- first, rest = field_name._formatter_field_name_split()
- obj = self.get_value(first, args, kwargs)
- # loop through the rest of the field_name, doing
- # getattr or getitem as needed
- for is_attr, i in rest:
- if is_attr:
- obj = getattr(obj, i)
- else:
- obj = obj[i]
- return obj, first
 
                    
                     
                    
                 
                    
                
 
                
            
         
         浙公网安备 33010602011771号
浙公网安备 33010602011771号