继上文: PHP使用protobuf
虽然 PHP 能够完成序列化和反序列化,但头条并不认可 PHP 生成的结果,所以最后改用 Python 脚本来做序列化。但问题很快就暴露出来:速度太慢!几万个设备号要序列化 2 小时以上。主要原因是当时赶时间,设备号是一个一个序列化的,每处理一个设备号就要调用一次 Python,大量时间花在了反复调用 Python 的开销(进程启动与上下文切换)上。上文里的脚本能用,但不适合数据量稍大的场景,故而用三脚猫的功夫写了一个新的 Python 脚本:接收一个文件,输出序列化后的新文件。速度大大提升,实测大约每秒可处理 1000 个设备号。
from __future__ import print_function
import DmpDataProtoV2_pb2
import os,sys
import time
import base64
def _encode_record(record):
    """Serialize one ``TYPE|device_id`` record to a base64-encoded DmpData message.

    Args:
        record: One input line with its line terminator already removed.

    Returns:
        The base64 text of the serialized message, or ``None`` when the
        record is blank or does not consist of exactly two ``|``-separated
        fields (such records are skipped by the caller).

    Raises:
        AttributeError: If the type field does not name an attribute of
            ``DmpDataProtoV2_pb2.IdItem`` (for example an unknown data type).
    """
    if not record.strip():
        return None
    fields = record.split('|')
    if len(fields) != 2:
        return None
    dtype, dev_id = fields

    dmp_data = DmpDataProtoV2_pb2.DmpData()
    id_item = dmp_data.idList.add()
    # The type column is looked up by name on IdItem (e.g. "IDFA", "IMEI_MD5").
    id_item.dataType = getattr(DmpDataProtoV2_pb2.IdItem, dtype)
    id_item.id = dev_id.lower()
    id_item.tags.append(dtype)

    encoded = base64.b64encode(dmp_data.SerializeToString())
    # b64encode returns bytes on Python 3; decode so the result can be
    # concatenated with "\n" and written to a text-mode file.
    return encoded.decode('ascii')


# Usage: python base64DmpItemByFile.py <input-file>
# Each input line is "TYPE|device_id". One base64 line per valid record is
# written to "<input-basename>-ProtoBuf.txt", and that path is printed to stdout.
# The messages and the default exit status are unchanged on purpose: the
# caller captures stdout.
if len(sys.argv) <= 1:
    print('ag is null')
    sys.exit()

src_path = sys.argv[1]
if not src_path.strip():
    print('files is null')
    sys.exit()
if not os.path.exists(src_path):
    print('files is not exists')
    sys.exit()

target_file = os.path.splitext(src_path)[0] + '-ProtoBuf.txt'
print(target_file)

# Opening with 'w' already truncates the target. The context managers close
# both files even if a record fails to encode.
with open(src_path) as src, open(target_file, 'w') as dst:
    # Iterating the file always advances to the next line, so blank or
    # malformed lines are skipped without stalling or ending the run early.
    for raw_line in src:
        encoded_line = _encode_record(raw_line.rstrip('\r\n'))
        if encoded_line is not None:
            dst.write(encoded_line + '\n')
PHP调用部分
// Serialize the device-ID file with the Python script; shell_exec() returns the script's stdout.
// escapeshellarg() keeps paths containing spaces or shell metacharacters from breaking
// (or injecting into) the command line.
// NOTE(review): the script print()s the output path, so the captured value presumably
// ends with a newline — confirm whether the code using $protobuf_path trims it.
$protobuf_path = shell_exec("python " . escapeshellarg(base_path() . "/scripts/python/base64DmpItemByFile.py") . " " . escapeshellarg($file_path));
Done!
DmpDataProtoV2_pb2.py
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: DmpDataProtoV2.proto
import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)
# Default symbol database; the file descriptor, enum descriptor and message
# classes of this module are registered in it.
_sym_db = _symbol_database.Default()
# File-level descriptor for DmpDataProtoV2.proto (package toutiao.dmp).
# serialized_pb is the serialized form of the .proto definition. The
# serialized_start / serialized_end values on the descriptors in this module
# are byte offsets into it, so it must not be edited by hand.
DESCRIPTOR = _descriptor.FileDescriptor(
name='DmpDataProtoV2.proto',
package='toutiao.dmp',
serialized_pb=_b('\n\x14\x44mpDataProtoV2.proto\x12\x0btoutiao.dmp\".\n\x07\x44mpData\x12#\n\x06idList\x18\x01 \x03(\x0b\x32\x13.toutiao.dmp.IdItem\"\xda\x01\n\x06IdItem\x12\x11\n\ttimestamp\x18\x01 \x01(\r\x12.\n\x08\x64\x61taType\x18\x02 \x02(\x0e\x32\x1c.toutiao.dmp.IdItem.DataType\x12\n\n\x02id\x18\x03 \x02(\t\x12\x0c\n\x04tags\x18\x04 \x03(\t\"s\n\x08\x44\x61taType\x12\x08\n\x04IMEI\x10\x00\x12\x08\n\x04IDFA\x10\x01\x12\x07\n\x03UID\x10\x02\x12\x0c\n\x08IMEI_MD5\x10\x04\x12\x0c\n\x08IDFA_MD5\x10\x05\x12\x16\n\x12MOBILE_HASH_SHA256\x10\x06\x12\x08\n\x04OAID\x10\x07\x12\x0c\n\x08OAID_MD5\x10\x08\x42\x0e\x42\x0c\x44mpDataProto')
)
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
# Descriptor for the enum toutiao.dmp.IdItem.DataType (the kind of device/user
# identifier carried by an IdItem).
# For each value, `index` is its position in this list and `number` is the
# enum's numeric value. The numbers are not contiguous: 3 is unused
# (IMEI=0, IDFA=1, UID=2, IMEI_MD5=4, IDFA_MD5=5, MOBILE_HASH_SHA256=6,
# OAID=7, OAID_MD5=8).
_IDITEM_DATATYPE = _descriptor.EnumDescriptor(
name='DataType',
full_name='toutiao.dmp.IdItem.DataType',
filename=None,
file=DESCRIPTOR,
values=[
_descriptor.EnumValueDescriptor(
name='IMEI', index=0, number=0,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='IDFA', index=1, number=1,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='UID', index=2, number=2,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='IMEI_MD5', index=3, number=4,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='IDFA_MD5', index=4, number=5,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='MOBILE_HASH_SHA256', index=5, number=6,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='OAID', index=6, number=7,
options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='OAID_MD5', index=7, number=8,
options=None,
type=None),
],
# containing_type is filled in later in this module, once _IDITEM exists.
containing_type=None,
options=None,
serialized_start=189,
serialized_end=304,
)
_sym_db.RegisterEnumDescriptor(_IDITEM_DATATYPE)
# Descriptor for the message toutiao.dmp.DmpData.
# It has a single field, idList (field number 1). In protobuf's
# FieldDescriptor constants, type=11 is a message field and label=3 is
# repeated, i.e. a repeated IdItem.
_DMPDATA = _descriptor.Descriptor(
name='DmpData',
full_name='toutiao.dmp.DmpData',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='idList', full_name='toutiao.dmp.DmpData.idList', index=0,
number=1, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
# message_type is left as None here and set to _IDITEM later in this module.
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
enum_types=[
],
options=None,
is_extendable=False,
extension_ranges=[],
oneofs=[
],
serialized_start=37,
serialized_end=83,
)
# Descriptor for the message toutiao.dmp.IdItem.
# Fields, using protobuf's FieldDescriptor constants
# (label 1/2/3 = optional/required/repeated):
#   1 timestamp - type=13 (uint32), optional
#   2 dataType  - type=14 (enum),   required
#   3 id        - type=9  (string), required
#   4 tags      - type=9  (string), repeated
_IDITEM = _descriptor.Descriptor(
name='IdItem',
full_name='toutiao.dmp.IdItem',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='timestamp', full_name='toutiao.dmp.IdItem.timestamp', index=0,
number=1, type=13, cpp_type=3, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='dataType', full_name='toutiao.dmp.IdItem.dataType', index=1,
number=2, type=14, cpp_type=8, label=2,
has_default_value=False, default_value=0,
# enum_type is left as None here and set to _IDITEM_DATATYPE later in this module.
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='id', full_name='toutiao.dmp.IdItem.id', index=2,
number=3, type=9, cpp_type=9, label=2,
has_default_value=False, default_value=_b("").decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
_descriptor.FieldDescriptor(
name='tags', full_name='toutiao.dmp.IdItem.tags', index=3,
number=4, type=9, cpp_type=9, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
options=None),
],
extensions=[
],
nested_types=[],
# DataType is declared inside IdItem, so its descriptor is listed here.
enum_types=[
_IDITEM_DATATYPE,
],
options=None,
is_extendable=False,
extension_ranges=[],
oneofs=[
],
serialized_start=86,
serialized_end=304,
)
# Resolve the cross-references that were left as None while the descriptors
# above were being constructed.
_DMPDATA.fields_by_name['idList'].message_type = _IDITEM
_IDITEM.fields_by_name['dataType'].enum_type = _IDITEM_DATATYPE
_IDITEM_DATATYPE.containing_type = _IDITEM
# Expose both message descriptors on the file descriptor by name.
DESCRIPTOR.message_types_by_name['DmpData'] = _DMPDATA
DESCRIPTOR.message_types_by_name['IdItem'] = _IDITEM
# Build the concrete message class DmpData from its descriptor and register it.
DmpData = _reflection.GeneratedProtocolMessageType('DmpData', (_message.Message,), dict(
DESCRIPTOR = _DMPDATA,
__module__ = 'DmpDataProtoV2_pb2'
# @@protoc_insertion_point(class_scope:toutiao.dmp.DmpData)
))
_sym_db.RegisterMessage(DmpData)
# Build the concrete message class IdItem from its descriptor and register it.
IdItem = _reflection.GeneratedProtocolMessageType('IdItem', (_message.Message,), dict(
DESCRIPTOR = _IDITEM,
__module__ = 'DmpDataProtoV2_pb2'
# @@protoc_insertion_point(class_scope:toutiao.dmp.IdItem)
))
_sym_db.RegisterMessage(IdItem)
# Attach the file-level options, parsed from their serialized form.
# NOTE(review): the bytes look like a single string option with the value
# "DmpDataProto" (presumably java_outer_classname) — confirm against the .proto.
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('B\014DmpDataProto'))
# @@protoc_insertion_point(module_scope)
**粗体** _斜体_ [链接](http://example.com) `代码` - 列表 > 引用
。你还可以使用@
来通知其他用户。