Reading Serialized PHP Objects from Erlang

I started writing some Erlang recently. The vast majority of the data I need to access from Erlang resides in cached, serialized PHP objects. Here’s what I came up with to turn a serialized PHP object into a nested, proplist-like Erlang structure.

  <?php
  // A mixed array: int, string, float and a nested associative array.
  // serialize($s) yields the string shown below.
  $s = array(123, 'hello', 3.14, array('a' => 'foo', 'b' => 'bar'));
  ?>

This gives:

a:4:{i:0;i:123;i:1;s:5:"hello";i:2;d:3.14;i:3;a:2:{s:1:"a";s:3:"foo";s:1:"b";s:3:"bar";}}

It’s not hard to see how the (relatively undocumented) PHP serialization format works. Here’s what it becomes in Erlang:


1> php:unserialize("a:4:{i:0;i:123;i:1;s:5:\"hello\";i:2;d:3.14;i:3;a:2:{s:1:\"a\";s:3:\"foo\";s:1:\"b\";s:3:\"bar\";}}").

{[[{0,123},
{1,<<"hello">>},
{2,3.14},
{3,[{a,<<"foo">>},{b,<<"bar">>}]}]],
[]}

Here’s what it does with objects:

  <?php
  // Simple value object used to demonstrate how PHP serializes objects:
  // the class name is followed by each property as a key/value pair.
  class ExampleClass {
      public $id = 123;
      public $name = "RJ";
      public $languages = array('php', 'erlang', 'etc');
  }
  $s = new ExampleClass();
  // $ser holds the O:12:"ExampleClass":3:{...} string fed to Erlang below.
  $ser = serialize($s);
  ?>


2> php:unserialize("O:12:\"ExampleClass\":3:{s:2:\"id\";i:123;s:4:\"name\";s:2:\"RJ\";s:9:\"languages\";a:3:{i:0;s:3:\"php\";i:1;s:6:\"erlang\";i:2;s:3:\"etc\";}}").
{[{class,"ExampleClass",
[{id,123},
{name,<<"RJ">>},
{languages,[{0,<<"php">>},
{1,<<"erlang">>},
{2,<<"etc">>}]}]}],
[]}

Due to a combination of PHP’s “relaxed” type system, an old database abstraction library, and munging things in and out of memcached, we sometimes end up with numeric properties, such as ‘id’, represented as strings by PHP. To mitigate this, I ended up with some nasty code that forces certain properties to a predefined type (“id” is always an int, etc.). Yuk. Anyway, here’s the Erlang module:

  %%
  %% Takes a serialized PHP value (the output of PHP's serialize()) and turns
  %% it into an Erlang data structure.
  %%
  %% Mapping of PHP type codes to Erlang terms:
  %%   N;               -> null
  %%   b:0; / b:1;      -> false / true
  %%   i:N;             -> integer()
  %%   d:F;             -> float(), or integer() when PHP printed no fraction
  %%   s:Len:"...";     -> binary()
  %%   r:N;             -> {php_ref, N}
  %%   a:N:{...}        -> [{Key, Value}] in serialized order
  %%   O:L:"Cls":N:{...} -> {class, "Cls", [{Key, Value}]}
  %%
  %% String keys and property names become lower-cased atoms; integer keys
  %% stay integers.
  %%
  -module(php).
  -author('Richard Jones <rj at last.fm>').
  -export([unserialize/1]).
  -export_type([php_value/0]).

  -type php_value() :: null
                     | boolean()
                     | integer()
                     | float()
                     | binary()
                     | {php_ref, integer()}
                     | [{term(), php_value()}]
                     | {class, string(), [{term(), php_value()}]}.

  %% Usage:  {Result, Leftover} = php:unserialize(...)
  %%
  %% Parses exactly one top-level value. Result is a one-element list holding
  %% the parsed value; Leftover is whatever input followed it ("" when the
  %% whole string was consumed). Accepts a binary or a list of bytes.
  %% Malformed input crashes with badmatch/badarg; input that ends in the
  %% middle of a value raises error:unexpected_end_of_input.
  -spec unserialize(binary() | string()) -> {[php_value()], string()}.
  unserialize(S) when is_binary(S) -> unserialize(binary_to_list(S));
  unserialize(S) when is_list(S)   -> takeval(S, 1).

  %% Internal stuff

  %% Parse Num consecutive values from Str; returns them in input order
  %% together with the unparsed remainder.
  takeval(Str, Num) ->
      {Parsed, Remains} = takeval(Str, Num, []),
      {lists:reverse(Parsed), Remains}.

  %% Once the requested number of values has been read, swallow the "}" that
  %% closes the enclosing array/object, if there is one.
  takeval([$} | Leftover], 0, Acc) -> {Acc, Leftover};
  takeval(Str, 0, Acc)             -> {Acc, Str};
  %% The guard rejects negative counts instead of recursing until input ends.
  takeval(Str, Num, Acc) when Num > 0 ->
      {Val, Rest} = phpval(Str),
      takeval(Rest, Num - 1, [Val | Acc]).

  %%
  %% Parse individual PHP values.
  %% A "phpval" here is T:val; where T is the type code for int, object,
  %% array etc. Every clause returns {Value, RestOfInput}.
  %%

  %% Simple ones:
  %% Running out of input while a value is still expected is an error.
  phpval([])                      -> erlang:error(unexpected_end_of_input);
  phpval([$} | Rest])             -> phpval(Rest);    % skip stray }
  phpval([$N, $; | Rest])         -> {null, Rest};    % null
  phpval([$b, $:, $1, $; | Rest]) -> {true, Rest};    % true
  phpval([$b, $:, $0, $; | Rest]) -> {false, Rest};   % false

  %% r seems to be a recursive reference to something, represented as an int.
  phpval([$r, $: | Rest]) ->
      {RefNum, [$; | Rest1]} = string:to_integer(Rest),
      {{php_ref, RefNum}, Rest1};

  %% int
  phpval([$i, $: | Rest]) ->
      {Num, [$; | Rest1]} = string:to_integer(Rest),
      {Num, Rest1};

  %% double / float
  %% NB: PHP floats can be printed without a fraction ("d:3;"), which
  %% string:to_float/1 rejects, so fall back to string:to_integer/1.
  phpval([$d, $: | Rest]) ->
      {Num, [$; | Rest1]} =
          case string:to_float(Rest) of
              {error, no_float} -> string:to_integer(Rest);
              Parsed            -> Parsed
          end,
      {Num, Rest1};

  %% string: s:Len:"bytes";
  %% Len counts bytes, so take exactly Len characters and then insist on the
  %% closing quote and semicolon rather than skipping them blindly.
  phpval([$s, $: | Rest]) ->
      {Len, [$:, $" | Rest1]} = string:to_integer(Rest),
      {Str, [$", $; | Rest2]} = lists:split(Len, Rest1),
      {list_to_binary(Str), Rest2};

  %% array: a:NumEntries:{k1 v1 k2 v2 ...}
  phpval([$a, $: | Rest]) ->
      {NumEntries, [$:, ${ | Rest1]} = string:to_integer(Rest),
      {Array, Rest2} = takeval(Rest1, NumEntries * 2),
      {arraytidy(Array), Rest2};

  %% object: O:4:"User":53:{ ... }
  phpval([$O, $: | Rest]) ->
      {ClassnameLen, [$:, $" | Rest1]} = string:to_integer(Rest),
      %% Rest1: classname":NumEnt:{..
      {Classname, [$", $: | Rest2]} = lists:split(ClassnameLen, Rest1),
      {NumEntries, [$:, ${ | Rest3]} = string:to_integer(Rest2),
      {Classvals, Rest4} = takeval(Rest3, NumEntries * 2),
      {{class, Classname, arraytidy(Classvals)}, Rest4}.

  %%
  %% Helpers:
  %%

  %% Convert [k1,v1,k2,v2,k3,v3] into [{k1,v1},{k2,v2},{k3,v3}].
  %% Pairs are consumed two at a time, so no in-band marker is needed and a
  %% key that happens to be called "key___partial" is handled like any other.
  arraytidy([K, V | T]) -> [{atomize(K), V} | arraytidy(T)];
  arraytidy([])         -> [].

  %% Make properties or keys into (lower-cased) atoms; anything that is not a
  %% string, e.g. an integer array index, is returned unchanged.
  %%
  %% SECURITY NOTE: list_to_atom/1 creates a new atom for every distinct key
  %% and atoms are never garbage collected. Only feed this module data from a
  %% trusted source (such as your own cache); attacker-controlled keys can
  %% exhaust the atom table and bring down the VM.
  atomize(K) when is_binary(K) -> atomize(binary_to_list(K));
  atomize(K) when is_list(K)   -> list_to_atom(string:to_lower(K));
  atomize(K)                   -> K.

Tags: ,

Saturday, September 27th, 2008 programming

9 Comments to Reading Serialized PHP Objects from Erlang

  1. Do you all use Thrift at Facebook to serialize PHP objects? It comes with built-in PHP and Erlang generators.

  2. John Wright on October 29th, 2008
  3. Erm, I don’t know exactly what facebook do, but i think i read somewhere that they do use thrift for serializing php objects, yes.

    Sadly changing our php serialization format isn’t gonna happen, hence the need for the above code.

  4. RJ on October 30th, 2008
  5. Thanks! I used this code in a gen_server that provides PHP eval access within Erlang. Under what license do you release this?

  6. Andy Skelton on January 16th, 2009
  7. I didn’t see the low-contrast message in the footer before. GPLv2 then. Thanks.

  8. Andy Skelton on January 19th, 2009
  9. Yep GPLv2 – i should make that more visible.
    I’ve also rolled a php-eval server in erlang, maybe i’ll dust it off and publish it soon :)

  10. RJ on February 5th, 2009
  11. I have a good fresh joke for you! What did the elephant say to the naked man? It’s cute, but can you pick up peanuts with it?
    ___________________________
    –/ viagara generic /–

  12. myracer on October 24th, 2009
  13. I am having a hard time deciphering the serialized string. In your example above, your array is serialized as:
    a:4:{i:0;i:123;i:1;s:5:”hello”;…

    this is confusing to me. if the syntax is i:0 means index = 0, and i:123 means the value at [0] = 123, how does unserialize know whether the i: means the index or the value?

    sorry if thats a dumb question, i can’t find any information about this syntax anywhere.

  14. emilio on November 4th, 2009
  15. sorry, I forgot to add:
    and why in the second array is there not an i:0? eg.
    a:2:{s:1:”a”;….

    based on the first portion of the code, I would have expected:
    a:2:{i:0;s:1:”a”;

    why is there no reference to index in the second array?

    thanks for any advice, and if you can point me to a reference I would be VERY grateful because I have large, complex data structure serialized (composed of many objects and data types) and its a little hairy at the moment.

  16. emilio on November 4th, 2009
  17. Sorry, for off top, i wanna tell one joke) What do you get if you cross a giant and a vampire? A BIG pain in the neck!
    ___________________________
    –/ viagera buy Illinois /–

  18. myrnacid on November 12th, 2009

Leave a comment