2

继上文: PHP使用protobuf

虽然 PHP 能完成序列化和反序列化,但头条并不认 PHP 生成的结果,最后只好改用 Python 脚本来做序列化。但问题很快就暴露出来:速度太慢!几万个设备号要序列化 2 小时以上。主要原因是当时赶时间,每个设备号都单独调用一次 Python 脚本,大量时间花在了 PHP 与 Python 之间的反复调用和上下文切换上。上文里的脚本能用,但不适合数据量稍大的场景,所以我用三脚猫的功夫写了一个新的 Python 脚本:接收一个文件,输出序列化后的新文件。速度大大提升,实测大约每秒可处理 1000 个设备号。

from __future__ import print_function
import DmpDataProtoV2_pb2
import os,sys
import time
import base64


def _output_path(src_path):
    """Return the output file name derived from *src_path*.

    The source extension is replaced by the ``-ProtoBuf.txt`` suffix.
    """
    return os.path.splitext(src_path)[0] + '-ProtoBuf.txt'


def convert_file(src_path):
    """Serialize every ``TYPE|device_id`` line of *src_path* into a new file.

    Each well-formed input line becomes one output line holding the
    base64-encoded ``DmpData`` message (a single ``IdItem`` whose ``dataType``
    is the enum value named by ``TYPE``, whose ``id`` is the lower-cased
    device id and whose only tag is ``TYPE``).

    Blank lines and lines that do not contain exactly two ``|``-separated
    fields are skipped; they neither stop the conversion nor stall it.

    Args:
        src_path: path of the text file to read.

    Returns:
        The path of the file that was written.

    Raises:
        AttributeError: if ``TYPE`` does not name an attribute of
            ``DmpDataProtoV2_pb2.IdItem`` (for example an unknown data type).
    """
    target_path = _output_path(src_path)
    # The output is opened in binary mode because base64.b64encode() returns
    # bytes on Python 3; on Python 2 bytes and str are the same type, so the
    # script works on both.
    with open(src_path) as src, open(target_path, 'wb') as dst:
        for raw_line in src:
            # Drop the line terminator only ('\r' too, for CRLF files read on
            # Python 2); inner whitespace of the fields is left untouched.
            record = raw_line.strip('\r\n')
            if not record.strip():
                continue
            fields = record.split('|')
            if len(fields) != 2:
                continue
            dtype, dev_id = fields

            dmp_data = DmpDataProtoV2_pb2.DmpData()
            item = dmp_data.idList.add()
            item.dataType = getattr(DmpDataProtoV2_pb2.IdItem, dtype)
            item.id = dev_id.lower()
            item.tags.append(dtype)
            # The optional ``timestamp`` field is intentionally left unset.

            dst.write(base64.b64encode(dmp_data.SerializeToString()) + b"\n")
    return target_path


def main(argv):
    """Command-line entry point: ``script.py <input-file>``.

    Prints the output file name on stdout (the calling PHP code captures it)
    and then performs the conversion. Usage problems are reported on stdout
    with the same messages as before and end the run without converting.

    Args:
        argv: the full argument vector, including the program name.
    """
    if len(argv) <= 1:
        print('ag is null')
        return
    src_path = argv[1]
    if not src_path.strip():
        print('files is null')
        return
    if not os.path.exists(src_path):
        print('files is not exists')
        return
    # Announce the output path before converting, as the caller reads it
    # from stdout.
    print(_output_path(src_path))
    convert_file(src_path)


if __name__ == '__main__':
    main(sys.argv)

PHP调用部分

// Regenerate the protobuf file through the Python script.
// Both the script path and the input path are passed through escapeshellarg()
// so that spaces or shell metacharacters in either path cannot break the
// command line or inject extra commands.
// Note: the script prints the output path followed by a newline, so the
// captured value ends with "\n".
$protobuf_path = shell_exec("python ".escapeshellarg(base_path()."/scripts/python/base64DmpItemByFile.py")." ".escapeshellarg($file_path));

Done!

DmpDataProtoV2_pb2.py

# Generated by the protocol buffer compiler.  DO NOT EDIT!
# source: DmpDataProtoV2.proto

import sys
_b=sys.version_info[0]<3 and (lambda x:x) or (lambda x:x.encode('latin1'))
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
from google.protobuf import descriptor_pb2
# @@protoc_insertion_point(imports)

# Default symbol database; the file descriptor, enum descriptor and message
# classes defined in this module are registered with it.
_sym_db = _symbol_database.Default()




# File-level descriptor for DmpDataProtoV2.proto (package ``toutiao.dmp``).
# ``serialized_pb`` is the serialized form of the .proto file as emitted by
# protoc. The serialized_start/serialized_end values on the descriptors
# further down are byte offsets into this string, so it must not be edited by
# hand -- regenerate the module from the .proto file instead.
DESCRIPTOR = _descriptor.FileDescriptor(
  name='DmpDataProtoV2.proto',
  package='toutiao.dmp',
  serialized_pb=_b('\n\x14\x44mpDataProtoV2.proto\x12\x0btoutiao.dmp\".\n\x07\x44mpData\x12#\n\x06idList\x18\x01 \x03(\x0b\x32\x13.toutiao.dmp.IdItem\"\xda\x01\n\x06IdItem\x12\x11\n\ttimestamp\x18\x01 \x01(\r\x12.\n\x08\x64\x61taType\x18\x02 \x02(\x0e\x32\x1c.toutiao.dmp.IdItem.DataType\x12\n\n\x02id\x18\x03 \x02(\t\x12\x0c\n\x04tags\x18\x04 \x03(\t\"s\n\x08\x44\x61taType\x12\x08\n\x04IMEI\x10\x00\x12\x08\n\x04IDFA\x10\x01\x12\x07\n\x03UID\x10\x02\x12\x0c\n\x08IMEI_MD5\x10\x04\x12\x0c\n\x08IDFA_MD5\x10\x05\x12\x16\n\x12MOBILE_HASH_SHA256\x10\x06\x12\x08\n\x04OAID\x10\x07\x12\x0c\n\x08OAID_MD5\x10\x08\x42\x0e\x42\x0c\x44mpDataProto')
)
_sym_db.RegisterFileDescriptor(DESCRIPTOR)



# Descriptor for the enum ``toutiao.dmp.IdItem.DataType``.
# For every value, ``index`` is its position in this list and ``number`` is
# the numeric enum value. The numbers are not contiguous: 3 is not
# assigned (the list goes 0, 1, 2, 4, 5, 6, 7, 8).
# ``containing_type`` is None here and is set to _IDITEM later in the module.
_IDITEM_DATATYPE = _descriptor.EnumDescriptor(
  name='DataType',
  full_name='toutiao.dmp.IdItem.DataType',
  filename=None,
  file=DESCRIPTOR,
  values=[
    _descriptor.EnumValueDescriptor(
      name='IMEI', index=0, number=0,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='IDFA', index=1, number=1,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='UID', index=2, number=2,
      options=None,
      type=None),
    # Note the jump from number 2 to number 4.
    _descriptor.EnumValueDescriptor(
      name='IMEI_MD5', index=3, number=4,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='IDFA_MD5', index=4, number=5,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='MOBILE_HASH_SHA256', index=5, number=6,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='OAID', index=6, number=7,
      options=None,
      type=None),
    _descriptor.EnumValueDescriptor(
      name='OAID_MD5', index=7, number=8,
      options=None,
      type=None),
  ],
  containing_type=None,
  options=None,
  # Byte range of this enum's definition inside DESCRIPTOR's serialized_pb.
  serialized_start=189,
  serialized_end=304,
)
_sym_db.RegisterEnumDescriptor(_IDITEM_DATATYPE)


# Descriptor for the message ``toutiao.dmp.DmpData``, which has a single
# field, ``idList`` (field number 1).
_DMPDATA = _descriptor.Descriptor(
  name='DmpData',
  full_name='toutiao.dmp.DmpData',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    # type=11 / cpp_type=10 / label=3 correspond to the protobuf
    # FieldDescriptor constants TYPE_MESSAGE / CPPTYPE_MESSAGE /
    # LABEL_REPEATED, i.e. a repeated sub-message field.
    # ``message_type`` is None here and is set to _IDITEM later in the module.
    _descriptor.FieldDescriptor(
      name='idList', full_name='toutiao.dmp.DmpData.idList', index=0,
      number=1, type=11, cpp_type=10, label=3,
      has_default_value=False, default_value=[],
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
  ],
  extensions=[
  ],
  nested_types=[],
  enum_types=[
  ],
  options=None,
  is_extendable=False,
  extension_ranges=[],
  oneofs=[
  ],
  # Byte range of this message's definition inside DESCRIPTOR's serialized_pb.
  serialized_start=37,
  serialized_end=83,
)


# Descriptor for the message ``toutiao.dmp.IdItem`` with four fields:
# timestamp (1), dataType (2), id (3) and tags (4). The ``label`` values use
# the protobuf FieldDescriptor constants: 1 = LABEL_OPTIONAL,
# 2 = LABEL_REQUIRED, 3 = LABEL_REPEATED.
_IDITEM = _descriptor.Descriptor(
  name='IdItem',
  full_name='toutiao.dmp.IdItem',
  filename=None,
  file=DESCRIPTOR,
  containing_type=None,
  fields=[
    # Optional uint32 (type=13 is TYPE_UINT32) with default 0.
    _descriptor.FieldDescriptor(
      name='timestamp', full_name='toutiao.dmp.IdItem.timestamp', index=0,
      number=1, type=13, cpp_type=3, label=1,
      has_default_value=False, default_value=0,
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
    # Required enum (type=14 is TYPE_ENUM); ``enum_type`` is None here and is
    # set to _IDITEM_DATATYPE later in the module.
    _descriptor.FieldDescriptor(
      name='dataType', full_name='toutiao.dmp.IdItem.dataType', index=1,
      number=2, type=14, cpp_type=8, label=2,
      has_default_value=False, default_value=0,
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
    # Required string (type=9 is TYPE_STRING) with an empty default.
    _descriptor.FieldDescriptor(
      name='id', full_name='toutiao.dmp.IdItem.id', index=2,
      number=3, type=9, cpp_type=9, label=2,
      has_default_value=False, default_value=_b("").decode('utf-8'),
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
    # Repeated string.
    _descriptor.FieldDescriptor(
      name='tags', full_name='toutiao.dmp.IdItem.tags', index=3,
      number=4, type=9, cpp_type=9, label=3,
      has_default_value=False, default_value=[],
      message_type=None, enum_type=None, containing_type=None,
      is_extension=False, extension_scope=None,
      options=None),
  ],
  extensions=[
  ],
  nested_types=[],
  # The DataType enum is declared inside this message.
  enum_types=[
    _IDITEM_DATATYPE,
  ],
  options=None,
  is_extendable=False,
  extension_ranges=[],
  oneofs=[
  ],
  # Byte range of this message's definition inside DESCRIPTOR's serialized_pb.
  serialized_start=86,
  serialized_end=304,
)

# Resolve the cross-references that were left as None in the descriptors
# above, then expose both message descriptors on the file descriptor.
_DMPDATA.fields_by_name['idList'].message_type = _IDITEM
_IDITEM.fields_by_name['dataType'].enum_type = _IDITEM_DATATYPE
_IDITEM_DATATYPE.containing_type = _IDITEM
DESCRIPTOR.message_types_by_name['DmpData'] = _DMPDATA
DESCRIPTOR.message_types_by_name['IdItem'] = _IDITEM

# Build the concrete message class for DmpData from its descriptor and
# register it with the symbol database.
DmpData = _reflection.GeneratedProtocolMessageType('DmpData', (_message.Message,), dict(
  DESCRIPTOR = _DMPDATA,
  __module__ = 'DmpDataProtoV2_pb2'
  # @@protoc_insertion_point(class_scope:toutiao.dmp.DmpData)
  ))
_sym_db.RegisterMessage(DmpData)

# Build the concrete message class for IdItem from its descriptor and
# register it with the symbol database.
IdItem = _reflection.GeneratedProtocolMessageType('IdItem', (_message.Message,), dict(
  DESCRIPTOR = _IDITEM,
  __module__ = 'DmpDataProtoV2_pb2'
  # @@protoc_insertion_point(class_scope:toutiao.dmp.IdItem)
  ))
_sym_db.RegisterMessage(IdItem)


# Attach the file-level options, parsed from their serialized form.
# NOTE(review): the payload looks like FileOptions field 8
# (java_outer_classname) set to "DmpDataProto" -- confirm against the .proto.
# NOTE(review): _descriptor._ParseOptions is a private protobuf helper;
# whether it exists depends on the installed protobuf runtime version.
DESCRIPTOR.has_options = True
DESCRIPTOR._options = _descriptor._ParseOptions(descriptor_pb2.FileOptions(), _b('B\014DmpDataProto'))
# @@protoc_insertion_point(module_scope)

tfzh
231 声望17 粉丝

code what u love & love what u code