JSON的组成
一个合法的JSON字符串可以包含这几种元素:
1、特殊符号,如 "{" "}" 表示一个JSON Object,"[" "]" 表示一个JSON Array,":" 用于分隔key和value,"," 用于分隔两个元素
2、字符串,用引号引起来
3、数字,由0-9组成,浮点数带有小数点 ".",数字前可带有正负号 "+" 或 "-"
4、常量有true,false,null
分词(or 词法分析)
分词的主要目的是将字符串分割成一个个合法的元素,其中每一个特殊符号都为一个元素,一个完整的字符串表达是一个元素,一个完整的数字表达等等
不难发现,特殊符号都只包含一个字符;常量和数字的表示中间均不会出现空格或其他不相关字符,因此可以轻易地根据表达的特征来区分,例如数字可以匹配一段连续的、所有字符都在0-9或 "." "+" "-" 之中的子序列(至于表达是否合法,稍后再做处理)。而字符串较为特殊:字符串原则上可以包含任何字符,但它也有自己的特征,即以一个 " 开始,又以另一个 " 结束;虽然字符串中间也可能出现 ",但若是合法表达,就必须使用 "\" 转义,因此注意处理转义字符就好了。下面是词法分析的代码:
class __Tokener:
    """Lexer that splits a JSON document into tokens, one per ``next()`` call.

    Values reported by ``cur_token()``:
      * structural characters ``{ } [ ] , :`` as one-character strings,
      * string literals as their decoded text (quotes removed, escapes
        resolved),
      * numbers as ``int`` or ``float``,
      * ``true`` / ``false`` / ``null`` as ``True`` / ``False`` / ``None``,
      * ``None`` once the input is exhausted.

    A string literal such as ``"{"`` yields the same value as the structural
    character ``{``, and ``null`` yields the same value as end of input, so
    ``is_symbol()``, ``is_string()`` and ``at_end()`` are provided to tell
    those cases apart.
    """

    _SYMBOLS = ('{', '}', '[', ']', ',', ':')
    # Tuples, not strings: '' (end of input) must never test as a member.
    _WHITESPACE = ('\x20', '\n', '\r', '\t')
    _CONSTANTS = {'true': True, 'false': False, 'null': None}
    _ESCAPES = {
        '"': '"', '\\': '\\', '/': '/',
        'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
    }

    def __init__(self, json_str):
        """Prepare to tokenize *json_str*; no token is read until ``next()``."""
        self.__str = json_str
        self.__i = 0
        self.__cur_token = None
        # One of 'symbol', 'string', 'value', 'end'; None before the first next().
        self.__kind = None

    def __cur_char(self):
        """Return the character under the cursor, or '' at end of input."""
        if self.__i < len(self.__str):
            return self.__str[self.__i]
        return ''

    def __move_i(self, step=1):
        """Advance the cursor by *step*; a no-op once the end is reached."""
        if self.__i < len(self.__str):
            self.__i += step

    def __next_hex4(self):
        """Read the four hex digits after a ``u`` and return their value.

        The cursor starts on the ``u`` and ends on the last hex digit.
        Raises ValueError if four hex digits do not follow.
        """
        digits = self.__str[self.__i + 1:self.__i + 5]
        if len(digits) != 4 or any(d not in '0123456789abcdefABCDEF' for d in digits):
            raise ValueError('Invalid unicode escape "\\u%s"' % digits)
        self.__move_i(4)
        return int(digits, 16)

    def __next_escape(self):
        """Decode one escape sequence and return the character it denotes.

        The cursor starts on the backslash and ends on the last character of
        the escape.  Raises ValueError for an unknown or truncated escape.
        """
        self.__move_i()  # step onto the character after the backslash
        esc = self.__cur_char()
        if esc in self._ESCAPES:
            return self._ESCAPES[esc]
        if esc != 'u':
            raise ValueError('Invalid escape "\\%s"' % esc)
        code = self.__next_hex4()
        if 0xD800 <= code <= 0xDBFF and self.__str[self.__i + 1:self.__i + 3] == '\\u':
            # A high surrogate followed by another \u escape may form a pair.
            mark = self.__i
            self.__move_i(2)
            low = self.__next_hex4()
            if 0xDC00 <= low <= 0xDFFF:
                code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
            else:
                self.__i = mark  # not a pair: leave the second escape unread
        # Build the character through the codec so this works on Python 2 and 3.
        return ('\\U%08x' % code).encode('ascii').decode('unicode_escape')

    def __next_string(self):
        """Read a string literal and return its decoded content.

        The cursor starts on the opening quote and ends just past the closing
        quote.  Raises ValueError if the closing quote is missing.
        """
        chars = []
        self.__move_i()  # skip the opening quote
        while True:
            ch = self.__cur_char()
            if ch == '':
                raise ValueError('unterminated string')
            if ch == '"':
                break
            if ch == '\\':
                ch = self.__next_escape()
            chars.append(ch)
            self.__move_i()
        self.__move_i()  # skip the closing quote
        return ''.join(chars)

    def __next_number(self):
        """Read a number and return it as ``int`` or ``float``.

        The cursor ends just past the number.  The characters are collected
        greedily; an ill-formed expression makes ``int()`` / ``float()`` raise
        ValueError.
        """
        start = self.__i
        while self.__cur_char().isdigit() or self.__cur_char() in ('.', '+', '-', 'e', 'E'):
            self.__move_i()
        expr = self.__str[start:self.__i]
        if any(mark in expr for mark in ('.', 'e', 'E')):
            return float(expr)
        return int(expr)

    def __next_const(self):
        """Read ``true`` / ``false`` / ``null`` and return the Python value.

        The cursor ends just past the word.  Raises ValueError for any other
        word.
        """
        start = self.__i
        while self.__cur_char().isalpha():
            self.__move_i()
        word = self.__str[start:self.__i]
        if word not in self._CONSTANTS:
            raise ValueError('Invalid symbol "%s"' % word)
        return self._CONSTANTS[word]

    def next(self):
        """Read the next token and make it the current one.

        Returns True when the new current token is not None, i.e. False both
        at end of input and for the ``null`` constant; use ``at_end()`` to
        distinguish the two.  Raises ValueError on a character that cannot
        start a token.
        """
        while self.__cur_char() in self._WHITESPACE:
            self.__move_i()
        ch = self.__cur_char()
        if ch == '':
            kind, cur_token = 'end', None
        elif ch in self._SYMBOLS:
            kind, cur_token = 'symbol', ch
            self.__move_i()
        elif ch == '"':
            kind, cur_token = 'string', self.__next_string()
        elif ch.isalpha():
            kind, cur_token = 'value', self.__next_const()
        elif ch.isdigit() or ch in ('.', '-', '+'):
            kind, cur_token = 'value', self.__next_number()
        else:
            raise ValueError('Invalid symbol "%s"' % ch)
        self.__kind = kind
        self.__cur_token = cur_token
        return cur_token is not None

    def cur_token(self):
        """Return the value of the token read by the last ``next()`` call."""
        return self.__cur_token

    def is_symbol(self, symbol=None):
        """Return True if the current token is a structural character.

        With *symbol* given, the token must also be that exact character.
        """
        if self.__kind != 'symbol':
            return False
        return symbol is None or self.__cur_token == symbol

    def is_string(self):
        """Return True if the current token came from a string literal."""
        return self.__kind == 'string'

    def at_end(self):
        """Return True if the last ``next()`` call hit the end of the input."""
        return self.__kind == 'end'
开始解析
JSON有两种聚合表达需要不同的解析方式
1、JSON Object: 一定以 “{” 开始, “}” 结束,因此只要遇到\"{\" 就应该触发JSON Object的解析,在这过程中,可以用一个循环表示
obj = {}
while True:
next = 下一个元素
next 作为 [key],应该是一个字符串表达
next = 下一个元素
next 应该为 \":\"
next = 下一个元素
next 应该为 [value] 的表达,若 [value] 是 \"{\"或\"[\",递归触发 Object 或 Array 解析
next = 下一个元素
obj[key] = val
if next == \",\": # 构成了 \"[key]: [value],\"
continue
elif next == \"}\": # 遇到\"}\",Object到此为止
break
return obj
2、JSON Array: 和上面类似,JSON Array一定以 "[" 开头,之后以 [value](值)、","(逗号)交替循环,并在最后一个值之后以 "]" 结束。值得注意的是JSON Array里面也可以嵌套JSON Object
3、特殊情况:一个空Object,空Array的表达为 {}, [],注意下特殊处理就好
这部分的代码如下
def __json_object(self, tokener):
    """Parse a JSON object whose opening "{" is the tokener's current token.

    On return the current token is the matching "}".

    :param tokener: lexer exposing ``next()`` and ``cur_token()``.
    :return: a dict mapping each key to its decoded value.
    :raises ValueError: if the object is malformed (bad key, missing ":",
        trailing comma, missing "}" or any other unexpected token).
    """
    if tokener.cur_token() != '{':
        raise ValueError('Json must start with "{"')
    obj = {}
    tokener.next()
    # "}" may only follow "{" directly; checking it on every iteration would
    # accept a trailing comma and throw away the members parsed so far.
    if tokener.cur_token() == '}':
        return obj
    while True:
        key = tokener.cur_token()
        # "}" in key position means a trailing comma such as {"a": 1,}.
        if not isinstance(key, str) or key == '}':
            raise ValueError('invalid key %s' % key)
        tokener.next()
        if tokener.cur_token() != ':':
            raise ValueError('expect ":" after "%s"' % key)
        tokener.next()
        val = tokener.cur_token()
        if val == '[':
            val = self.__json_array(tokener)  # recurse into a nested array
        elif val == '{':
            val = self.__json_object(tokener)  # recurse into a nested object
        elif val in (',', ':', '}', ']'):
            raise ValueError('unexpected token "%s" at key "%s"' % (val, key))
        obj[key] = val
        tokener.next()
        tk_split = tokener.cur_token()
        if tk_split == '}':
            return obj
        if tk_split is None:
            raise ValueError('missing "}" at the end of object')
        if tk_split != ',':
            raise ValueError('unexpected token "%s" at key "%s"' % (tk_split, key))
        tokener.next()  # step past "," onto the next key
def __json_array(self, tokener):
    """Parse a JSON array whose opening "[" is the tokener's current token.

    On return the current token is the matching "]".

    :param tokener: lexer exposing ``next()`` and ``cur_token()``.
    :return: a list of the decoded elements.
    :raises ValueError: if the array is malformed (trailing comma, missing
        "," between elements, missing "]" or any other unexpected token).
    """
    if tokener.cur_token() != '[':
        raise ValueError('Json array must start with "["')
    arr = []
    tokener.next()
    # "]" may only follow "[" directly; checking it on every iteration would
    # turn [1,] into an empty list.
    if tokener.cur_token() == ']':
        return arr
    while True:
        tk_temp = tokener.cur_token()
        if tk_temp == '{':
            val = self.__json_object(tokener)  # recurse into a nested object
        elif tk_temp == '[':
            val = self.__json_array(tokener)  # recurse into a nested array
        elif tk_temp in (',', ':', '}', ']'):
            raise ValueError('unexpected token "%s"' % tk_temp)
        else:
            val = tk_temp
        arr.append(val)
        tokener.next()
        tk_end = tokener.cur_token()
        if tk_end == ']':
            return arr
        if tk_end is None:
            raise ValueError('missing "]" at the end of array')
        if tk_end != ',':
            # Two values without a separator, e.g. [1 2].
            raise ValueError('unexpected token "%s"' % tk_end)
        tokener.next()  # step past "," onto the next element
def decode(self, json_str):
    """Decode a JSON document into Python objects; this is the entry point.

    :param json_str: the JSON text; it must hold a single object or array.
    :return: the decoded dict or list, or None when the first ``next()``
        call yields no token.
    :raises ValueError: if the document does not start with "{" or "[", is
        malformed, or is followed by further tokens.
    """
    tokener = JsonDecoder.__Tokener(json_str)
    if not tokener.next():
        return None
    first_token = tokener.cur_token()
    if first_token == '{':
        decode_val = self.__json_object(tokener)
    elif first_token == '[':
        decode_val = self.__json_array(tokener)
    else:
        # Arrays are accepted as well, so the message names both openers.
        raise ValueError('Json must start with "{" or "["')
    if tokener.next():
        raise ValueError('unexpected token "%s"' % tokener.cur_token())
    return decode_val
完整代码
# author: YotWei
# date: 2018-12-21 15:31
class JsonDecoder:
    """Minimal recursive-descent JSON decoder.

    ``decode()`` accepts a document whose top-level value is an object or an
    array and returns the corresponding dict / list.  Malformed input raises
    ValueError.
    """

    def __init__(self):
        pass

    def __json_value(self, tokener):
        """Return the value that starts at the tokener's current token.

        Nested objects and arrays are parsed recursively; on return the
        current token is the last token of the value.
        Raises ValueError at end of input or on a stray structural character.
        """
        if tokener.is_symbol('{'):
            return self.__json_object(tokener)
        if tokener.is_symbol('['):
            return self.__json_array(tokener)
        if tokener.at_end():
            raise ValueError('unexpected end of input')
        if tokener.is_symbol():
            raise ValueError('unexpected token "%s"' % tokener.cur_token())
        return tokener.cur_token()

    def __json_object(self, tokener):
        """Parse an object whose opening "{" is the current token.

        Returns the decoded dict; on return the current token is the matching
        "}".  Raises ValueError if the object is malformed.
        """
        if not tokener.is_symbol('{'):
            raise ValueError('Json must start with "{"')
        obj = {}
        tokener.next()
        # "}" may only follow "{" directly, so a trailing comma is an error
        # instead of silently discarding the members parsed so far.
        if tokener.is_symbol('}'):
            return obj
        while True:
            if not tokener.is_string():
                if tokener.at_end():
                    raise ValueError('missing "}" at the end of object')
                raise ValueError('invalid key %s' % tokener.cur_token())
            key = tokener.cur_token()
            tokener.next()
            if not tokener.is_symbol(':'):
                raise ValueError('expect ":" after "%s"' % key)
            tokener.next()
            obj[key] = self.__json_value(tokener)
            tokener.next()
            if tokener.is_symbol('}'):
                return obj
            if tokener.at_end():
                raise ValueError('missing "}" at the end of object')
            if not tokener.is_symbol(','):
                raise ValueError('unexpected token "%s" at key "%s"'
                                 % (tokener.cur_token(), key))
            tokener.next()  # step past "," onto the next key

    def __json_array(self, tokener):
        """Parse an array whose opening "[" is the current token.

        Returns the decoded list; on return the current token is the matching
        "]".  Raises ValueError if the array is malformed.
        """
        if not tokener.is_symbol('['):
            raise ValueError('Json array must start with "["')
        arr = []
        tokener.next()
        # "]" may only follow "[" directly, so [1,] is an error instead of [].
        if tokener.is_symbol(']'):
            return arr
        while True:
            arr.append(self.__json_value(tokener))
            tokener.next()
            if tokener.is_symbol(']'):
                return arr
            if tokener.at_end():
                raise ValueError('missing "]" at the end of array')
            if not tokener.is_symbol(','):
                # Two values without a separator, e.g. [1 2].
                raise ValueError('unexpected token "%s"' % tokener.cur_token())
            tokener.next()  # step past "," onto the next element

    def decode(self, json_str):
        """Decode a JSON document into Python objects.

        :param json_str: the JSON text; it must hold a single object or array.
        :return: the decoded dict or list.  None is returned when the first
            token is missing (empty / whitespace-only input) or is ``null``.
        :raises ValueError: if the document does not start with "{" or "[",
            is malformed, or is followed by further tokens.
        """
        tokener = JsonDecoder.__Tokener(json_str)
        if not tokener.next():
            return None
        if tokener.is_symbol('{'):
            decode_val = self.__json_object(tokener)
        elif tokener.is_symbol('['):
            decode_val = self.__json_array(tokener)
        else:
            raise ValueError('Json must start with "{" or "["')
        tokener.next()
        # at_end() rather than next()'s result: a trailing ``null`` also
        # makes next() return False but is still unexpected content.
        if not tokener.at_end():
            raise ValueError('unexpected token "%s"' % tokener.cur_token())
        return decode_val

    class __Tokener:  # the lexer lives here as a private inner class
        """Lexer that splits a JSON document into tokens, one per ``next()``.

        Values reported by ``cur_token()``: structural characters as
        one-character strings, string literals as their decoded text, numbers
        as ``int`` / ``float``, the constants as ``True`` / ``False`` /
        ``None``, and ``None`` at end of input.  ``is_symbol()``,
        ``is_string()`` and ``at_end()`` disambiguate values that look alike
        (the literal ``"{"`` vs. ``{``, ``null`` vs. end of input).
        """

        _SYMBOLS = ('{', '}', '[', ']', ',', ':')
        # Tuples, not strings: '' (end of input) must never test as a member.
        _WHITESPACE = ('\x20', '\n', '\r', '\t')
        _CONSTANTS = {'true': True, 'false': False, 'null': None}
        _ESCAPES = {
            '"': '"', '\\': '\\', '/': '/',
            'b': '\b', 'f': '\f', 'n': '\n', 'r': '\r', 't': '\t',
        }

        def __init__(self, json_str):
            """Prepare to tokenize *json_str*; nothing is read until ``next()``."""
            self.__str = json_str
            self.__i = 0
            self.__cur_token = None
            # One of 'symbol', 'string', 'value', 'end'; None before next().
            self.__kind = None

        def __cur_char(self):
            """Return the character under the cursor, or '' at end of input."""
            if self.__i < len(self.__str):
                return self.__str[self.__i]
            return ''

        def __move_i(self, step=1):
            """Advance the cursor by *step*; a no-op once the end is reached."""
            if self.__i < len(self.__str):
                self.__i += step

        def __next_hex4(self):
            """Read the four hex digits after a ``u`` and return their value.

            The cursor starts on the ``u`` and ends on the last hex digit.
            Raises ValueError if four hex digits do not follow.
            """
            digits = self.__str[self.__i + 1:self.__i + 5]
            if len(digits) != 4 or any(d not in '0123456789abcdefABCDEF' for d in digits):
                raise ValueError('Invalid unicode escape "\\u%s"' % digits)
            self.__move_i(4)
            return int(digits, 16)

        def __next_escape(self):
            """Decode one escape sequence and return the character it denotes.

            The cursor starts on the backslash and ends on the last character
            of the escape.  Raises ValueError for an unknown or truncated
            escape.
            """
            self.__move_i()  # step onto the character after the backslash
            esc = self.__cur_char()
            if esc in self._ESCAPES:
                return self._ESCAPES[esc]
            if esc != 'u':
                raise ValueError('Invalid escape "\\%s"' % esc)
            code = self.__next_hex4()
            if 0xD800 <= code <= 0xDBFF and self.__str[self.__i + 1:self.__i + 3] == '\\u':
                # A high surrogate followed by another \u escape may form a pair.
                mark = self.__i
                self.__move_i(2)
                low = self.__next_hex4()
                if 0xDC00 <= low <= 0xDFFF:
                    code = 0x10000 + ((code - 0xD800) << 10) + (low - 0xDC00)
                else:
                    self.__i = mark  # not a pair: leave the second escape unread
            # Build the character through the codec so this works on Python 2 and 3.
            return ('\\U%08x' % code).encode('ascii').decode('unicode_escape')

        def __next_string(self):
            """Read a string literal and return its decoded content.

            The cursor starts on the opening quote and ends just past the
            closing quote.  Raises ValueError if the closing quote is missing.
            """
            chars = []
            self.__move_i()  # skip the opening quote
            while True:
                ch = self.__cur_char()
                if ch == '':
                    raise ValueError('unterminated string')
                if ch == '"':
                    break
                if ch == '\\':
                    ch = self.__next_escape()
                chars.append(ch)
                self.__move_i()
            self.__move_i()  # skip the closing quote
            return ''.join(chars)

        def __next_number(self):
            """Read a number and return it as ``int`` or ``float``.

            The cursor ends just past the number.  Characters are collected
            greedily; an ill-formed expression makes ``int()`` / ``float()``
            raise ValueError.
            """
            start = self.__i
            while self.__cur_char().isdigit() or self.__cur_char() in ('.', '+', '-', 'e', 'E'):
                self.__move_i()
            expr = self.__str[start:self.__i]
            if any(mark in expr for mark in ('.', 'e', 'E')):
                return float(expr)
            return int(expr)

        def __next_const(self):
            """Read ``true`` / ``false`` / ``null`` and return the Python value.

            The cursor ends just past the word.  Raises ValueError for any
            other word.
            """
            start = self.__i
            while self.__cur_char().isalpha():
                self.__move_i()
            word = self.__str[start:self.__i]
            if word not in self._CONSTANTS:
                raise ValueError('Invalid symbol "%s"' % word)
            return self._CONSTANTS[word]

        def next(self):
            """Read the next token and make it the current one.

            Returns True when the new current token is not None, i.e. False
            both at end of input and for ``null``; ``at_end()`` distinguishes
            the two.  Raises ValueError on a character that cannot start a
            token.
            """
            while self.__cur_char() in self._WHITESPACE:
                self.__move_i()
            ch = self.__cur_char()
            if ch == '':
                kind, cur_token = 'end', None
            elif ch in self._SYMBOLS:
                kind, cur_token = 'symbol', ch
                self.__move_i()
            elif ch == '"':
                kind, cur_token = 'string', self.__next_string()
            elif ch.isalpha():
                kind, cur_token = 'value', self.__next_const()
            elif ch.isdigit() or ch in ('.', '-', '+'):
                kind, cur_token = 'value', self.__next_number()
            else:
                raise ValueError('Invalid symbol "%s"' % ch)
            self.__kind = kind
            self.__cur_token = cur_token
            return cur_token is not None

        def cur_token(self):
            """Return the value of the token read by the last ``next()`` call."""
            return self.__cur_token

        def is_symbol(self, symbol=None):
            """Return True if the current token is a structural character.

            With *symbol* given, the token must also be that exact character.
            """
            if self.__kind != 'symbol':
                return False
            return symbol is None or self.__cur_token == symbol

        def is_string(self):
            """Return True if the current token came from a string literal."""
            return self.__kind == 'string'

        def at_end(self):
            """Return True if the last ``next()`` call hit the end of input."""
            return self.__kind == 'end'
测试
from PyJsonParser import JsonDecoder

# Sample documents: an array of objects, nested empty containers, and an
# object holding empty containers.
test_cases = [
    '''
[{"first": "one","next": "two"},{"first": "three","next": "fore"},{"first": "five","next": "six"}]
''',
    '''
[{}, {}, [[]]]
''',
    '''
{"empty": {}, "array_empty": []}
'''
]

decoder = JsonDecoder()
for case in test_cases:
    # print(...) with a single argument behaves the same on Python 2 and 3.
    print(decoder.decode(case))
输出
[{\'next\': \'two\', \'first\': \'one\'}, {\'next\': \'fore\', \'first\': \'three\'}, {\'next\': \'six\', \'first\': \'five\'}]
[{}, {}, [[]]]
{\'array_empty\': [], \'empty\': {}}
(有Bug欢迎提出)
版权声明
本文仅代表作者观点,不代表百度立场。
本文系作者授权百度百家发表,未经许可,不得转载。



