Implementation of string in lua

Data structure definition

#define CommonHeader	struct GCObject *next; lu_byte tt; lu_byte marked

typedef struct GCObject {
  CommonHeader;
} GCObject;

typedef union Value {
  struct GCObject *gc;    /* collectable objects */
  void *p;         /* light userdata */
  lua_CFunction f; /* light C functions */
  lua_Integer i;   /* integer numbers */
  lua_Number n;    /* float numbers */
} Value;

typedef struct TValue {
  TValuefields;
} TValue;

//character string
typedef struct TString {
  CommonHeader;      //gc
  lu_byte extra;     //Flag indicating whether the long string has been hash ed
  lu_byte shrlen;    //Length of short string
  unsigned int hash; //hash value
  union {
    size_t lnglen;   //Length of long string
    struct TString *hnext;  //next pointer to short string
  } u; //The long string indicates the length, and the short string indicates the next pointer
  char contents[1]; //Text content. The specific size depends on the string
} TString;

The CommonHeader macro is the first member of the string. The reason for this is that it is consistent with the memory of the GCObject structure. It temporarily ignores lua's gc operation.

Design of lua string

  • lua virtual machine has a global hash table to store short strings (regular strings are not stored in the hash table)
  • The string is read-only. In lua language, all modifications to the string will regenerate the string
a = "1"
a = "2"

Why design a read-only string?
It can be analyzed from the following two points

String comparison and search: what is the traditional string comparison? First compare the length. If the length is equal, enter the cycle to compare character by character. So what happens after designing a read-only string? All strings are unique in memory, and the addresses of different strings are different. Therefore, if you want to judge whether the strings are the same, just compare whether the addresses are the same. Put the string into the hash table, and you can find the corresponding hash slot according to the hash value for traversal. Therefore, this design can improve the efficiency of comparison and search

Space saving: in lua, variables that store the same string point to the same memory, which saves space for creating a copy of the string

Long and short string

#define LUA_TSTRING		4
//Long and short macro definition
#define LUA_VSHRSTR 	 makevariant(LUA_TSTRING, 0) / / short string
#define LUA_VLNGSTR 	 makevariant(LUA_TSTRING, 1) / / long string
//Type judgment
#define ttisstring(o)		checktype((o), LUA_TSTRING)
#define ttisshrstring(o)	checktag((o), ctb(LUA_VSHRSTR))
#define ttislngstring(o)	checktag((o), ctb(LUA_VLNGSTR))
//Long short boundary
#define LUAI_MAXSHORTLEN	40
//Get value
#define getstr(ts)  ((ts)->contents)

The string type in lua is LUA_TSTRING, if lua is separated according to this value_ Vshrstr and LUA_VLNGSTR, similar to the previous number.

The difference between long and short strings lies in luai_ Maxshellen is a macro. If it is greater than this, it is a long string, and if it is less than or equal to this, it is a short string

String creation

Storage location

#define STRCACHE_N		53
#define STRCACHE_M		2

typedef struct stringtable {
  TString **hash; //Pointer array
  int nuse;  //Size of hash slot
  int size;  //Number of string stores
} stringtable;

typedef struct global_State {
  ...
  stringtable strt;  //String hash table
  ...
  TString *strcache[STRCACHE_N][STRCACHE_M];  //String cache
  ...
} global_State;

strt is where the string is ultimately stored
strcache makes a simple cache

Initialize hash table

LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
  ...
  
  //Empty hash table
  g->strt.size = g->strt.nuse = 0;
  g->strt.hash = NULL;
  
  ...
}

#define MINSTRTABSIZE	128

void luaS_init (lua_State *L) {
  global_State *g = G(L);
  int i, j;
  stringtable *tb = &G(L)->strt;
  //Create a hash slot with the size of minsrttabsize
  tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*);
  tablerehash(tb->hash, 0, MINSTRTABSIZE);  //Initialize hash slot
  tb->size = MINSTRTABSIZE;
  
  //Initialize cache string
  g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
  luaC_fix(L, obj2gco(g->memerrmsg));  /* it should never be collected */
  for (i = 0; i < STRCACHE_N; i++)  /* fill cache with valid strings */
    for (j = 0; j < STRCACHE_M; j++)
      g->strcache[i][j] = g->memerrmsg;
}

cache

#define point2uint(p)	((unsigned int)((size_t)(p) & UINT_MAX))

TString *luaS_new (lua_State *L, const char *str) {
  unsigned int i = point2uint(str) % STRCACHE_N;  //hash with address
  int j;
  TString **p = G(L)->strcache[i];		//Find the corresponding slot according to the hash value
  for (j = 0; j < STRCACHE_M; j++) {    //Find cache string
    if (strcmp(str, getstr(p[j])) == 0) //If there is a cache, the string has been created
      return p[j];
  }
  
  for (j = STRCACHE_M - 1; j > 0; j--)  //Shifting out of a cache location may empty the last cache
    p[j] = p[j - 1];
  //Create a new string
  p[0] = luaS_newlstr(L, str, strlen(str));
  return p[0];
}

You can see that the code makes a simple caching mechanism with the address as the hash value.
Parameter 1: mainly used to find storage location (hash table)
Parameter 2: create string

Create string entry

TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l <= LUAI_MAXSHORTLEN)
    return internshrstr(L, str, l); //Short string
  else {
  	//Long string
    TString *ts;
    if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char)))
      luaM_toobig(L);
    ts = luaS_createlngstrobj(L, l);
    memcpy(getstr(ts), str, l * sizeof(char));
    return ts;
  }
}

Parameter 1: find storage location (hash table)
Parameter 2: String
Parameter 3: string length

You can see through the length l and luai_ Maxshellen to compare the size to distinguish between long and short strings.

Creation of short string

//hash function 
unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
  unsigned int h = seed ^ cast_uint(l);
  for (; l > 0; l--)
    h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1]));
  return h;
}

static TString *internshrstr (lua_State *L, const char *str, size_t l) {
  TString *ts;
  global_State *g = G(L);
  stringtable *tb = &g->strt;	//hash table
  unsigned int h = luaS_hash(str, l, g->seed);//Calculate hash value
  TString **list = &tb->hash[lmod(h, tb->size)];//Find the corresponding slot according to the hash value of the string
  lua_assert(str != NULL);  /* otherwise 'memcmp'/'memcpy' are undefined */
  //Double check operation, if any, indicates that this string has been created before, it will be returned directly
  for (ts = *list; ts != NULL; ts = ts->u.hnext) {
    if (l == ts->shrlen && (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
      /* found! */
      if (isdead(g, ts))  /* dead (but not collected yet)? */
        changewhite(ts);  /* resurrect it */
      return ts;
    }
  }
  //The number of strings and the size of the hash table slot determine whether to rehash
  if (tb->nuse >= tb->size) {  /* need to grow string table? */
    growstrtab(L, tb);
    list = &tb->hash[lmod(h, tb->size)];  //rehash completes the new slot of the current string
  }
  ts = createstrobj(L, l, LUA_VSHRSTR, h);
  //String assignment
  memcpy(getstr(ts), str, l * sizeof(char));
  ts->shrlen = cast_byte(l);//String size
  //Add linked list (header insertion)
  ts->u.hnext = *list;
  *list = ts;
  tb->nuse++;//Number of strings + 1
  return ts;
}

This is the normal operation of hash table insertion

  • Calculate hash value
  • Find the slot according to the hash value
  • Traverse the current slot to check the duplicate, and return directly if there are duplicates
  • Finally, the assignment is to insert the hash table

A linked list is used to resolve hash conflicts in lua strings

rehash of short string

static void tablerehash (TString **vect, int osize, int nsize) {
  int i;
  for (i = osize; i < nsize; i++)  //Initialize new slot
    vect[i] = NULL;
  for (i = 0; i < osize; i++) {    //Traverse the old slot for rehash
    TString *p = vect[i];		  //Save current slot
    vect[i] = NULL;                //Empty the current slot
    while (p) {  //Traverse current slot
      TString *hnext = p->u.hnext; //Save next node location
	  
      unsigned int h = lmod(p->hash, nsize);  //Recalculate hash slot
	  //Insert linked list (header)
      p->u.hnext = vect[h];
      vect[h] = p;
	  
      p = hnext;				  //Location of the next node previously saved
    }
  }
}

void luaS_resize (lua_State *L, int nsize) {
  stringtable *tb = &G(L)->strt; //hash table
  int osize = tb->size; //oldsize
  TString **newvect;
  if (nsize < osize)  //Volume reduction
    tablerehash(tb->hash, osize, nsize);
  //realloc
  newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*);
  if (l_unlikely(newvect == NULL)) {  //realloc failed
    if (nsize < osize)
      tablerehash(tb->hash, nsize, osize);//If the volume has been shrunk before, restore the previous hash table
  }
  else {
    //Update hash table
    tb->hash = newvect;
    tb->size = nsize;
	
    if (nsize > osize) //Capacity expansion
      tablerehash(newvect, osize, nsize);
  }
}

The code is also relatively simple. Judge the expansion and reduction of capacity through oldsize and newsize. The key function is tablerehash, and the operation is relatively simple and routine

  • Traversal hash table
  • Get a new slot according to the hash value, and then insert it

Creation of long strings

TString *luaS_createlngstrobj (lua_State *L, size_t l) {
  TString *ts = createstrobj(L, l, LUA_VLNGSTR, G(L)->seed);
  ts->u.lnglen = l;
  return ts;
}

TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
    return internshrstr(L, str, l);
  else {
    TString *ts;
    if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char)))
      luaM_toobig(L);
    ts = luaS_createlngstrobj(L, l);
	//assignment
    memcpy(getstr(ts), str, l * sizeof(char));
    return ts;
  }
}

The creation of long strings is not so complicated. After the creation, the assignment is completed

Deletion of string

void luaS_remove (lua_State *L, TString *ts) {
  stringtable *tb = &G(L)->strt;//hash table
  TString **p = &tb->hash[lmod(ts->hash, tb->size)];//Slot corresponding to this string
  while (*p != ts)  //Find ts
    p = &(*p)->u.hnext;
  *p = (*p)->u.hnext;	//Let the next pointer of the previous node point to the next node
  tb->nuse--;
}

Long string not deleted

  • Traverse the corresponding slot to find the string to be deleted
  • Let the next pointer of the previous node point to the next node

String comparison

//Comparison of short strings
#define eqshrstr(a,b)	check_exp((a)->tt == LUA_VSHRSTR, (a) == (b))
//Comparison of long strings
int luaS_eqlngstr (TString *a, TString *b) {
  size_t len = a->u.lnglen;
  lua_assert(a->tt == LUA_VLNGSTR && b->tt == LUA_VLNGSTR);
  return (a == b) ||  /* same instance or... */
    ((len == b->u.lnglen) &&  /* equal length and ... */
     (memcmp(getstr(a), getstr(b), len) == 0));  /* equal contents */
}

Short string comparison direct comparison address
Long string comparisons take precedence over address comparisons, followed by regular string comparisons

reference resources

< Lua design and implementation > codedump

Based on lua5 4.3 source code

Keywords: lua

Added by Stripy42 on Mon, 31 Jan 2022 16:30:38 +0200