Data structure definition
#define CommonHeader struct GCObject *next; lu_byte tt; lu_byte marked typedef struct GCObject { CommonHeader; } GCObject; typedef union Value { struct GCObject *gc; /* collectable objects */ void *p; /* light userdata */ lua_CFunction f; /* light C functions */ lua_Integer i; /* integer numbers */ lua_Number n; /* float numbers */ } Value; typedef struct TValue { TValuefields; } TValue; //character string typedef struct TString { CommonHeader; //gc lu_byte extra; //Flag indicating whether the long string has been hash ed lu_byte shrlen; //Length of short string unsigned int hash; //hash value union { size_t lnglen; //Length of long string struct TString *hnext; //next pointer to short string } u; //The long string indicates the length, and the short string indicates the next pointer char contents[1]; //Text content. The specific size depends on the string } TString;
The CommonHeader macro is the first member of the string. The reason for this is that it is consistent with the memory of the GCObject structure. It temporarily ignores lua's gc operation.
Design of lua string
- lua virtual machine has a global hash table to store short strings (regular strings are not stored in the hash table)
- The string is read-only. In lua language, all modifications to the string will regenerate the string
a = "1" a = "2"
Why design a read-only string?
It can be analyzed from the following two points
String comparison and search: what is the traditional string comparison? First compare the length. If the length is equal, enter the cycle to compare character by character. So what happens after designing a read-only string? All strings are unique in memory, and the addresses of different strings are different. Therefore, if you want to judge whether the strings are the same, just compare whether the addresses are the same. Put the string into the hash table, and you can find the corresponding hash slot according to the hash value for traversal. Therefore, this design can improve the efficiency of comparison and search
Space saving: in lua, variables that store the same string point to the same memory, which saves space for creating a copy of the string
Long and short string
#define LUA_TSTRING 4 //Long and short macro definition #define LUA_VSHRSTR makevariant(LUA_TSTRING, 0) / / short string #define LUA_VLNGSTR makevariant(LUA_TSTRING, 1) / / long string //Type judgment #define ttisstring(o) checktype((o), LUA_TSTRING) #define ttisshrstring(o) checktag((o), ctb(LUA_VSHRSTR)) #define ttislngstring(o) checktag((o), ctb(LUA_VLNGSTR)) //Long short boundary #define LUAI_MAXSHORTLEN 40 //Get value #define getstr(ts) ((ts)->contents)
The string type in lua is LUA_TSTRING, if lua is separated according to this value_ Vshrstr and LUA_VLNGSTR, similar to the previous number.
The difference between long and short strings lies in luai_ Maxshellen is a macro. If it is greater than this, it is a long string, and if it is less than or equal to this, it is a short string
String creation
Storage location
#define STRCACHE_N 53 #define STRCACHE_M 2 typedef struct stringtable { TString **hash; //Pointer array int nuse; //Size of hash slot int size; //Number of string stores } stringtable; typedef struct global_State { ... stringtable strt; //String hash table ... TString *strcache[STRCACHE_N][STRCACHE_M]; //String cache ... } global_State;
strt is where the string is ultimately stored
strcache makes a simple cache
Initialize hash table
LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) { ... //Empty hash table g->strt.size = g->strt.nuse = 0; g->strt.hash = NULL; ... } #define MINSTRTABSIZE 128 void luaS_init (lua_State *L) { global_State *g = G(L); int i, j; stringtable *tb = &G(L)->strt; //Create a hash slot with the size of minsrttabsize tb->hash = luaM_newvector(L, MINSTRTABSIZE, TString*); tablerehash(tb->hash, 0, MINSTRTABSIZE); //Initialize hash slot tb->size = MINSTRTABSIZE; //Initialize cache string g->memerrmsg = luaS_newliteral(L, MEMERRMSG); luaC_fix(L, obj2gco(g->memerrmsg)); /* it should never be collected */ for (i = 0; i < STRCACHE_N; i++) /* fill cache with valid strings */ for (j = 0; j < STRCACHE_M; j++) g->strcache[i][j] = g->memerrmsg; }
cache
#define point2uint(p) ((unsigned int)((size_t)(p) & UINT_MAX)) TString *luaS_new (lua_State *L, const char *str) { unsigned int i = point2uint(str) % STRCACHE_N; //hash with address int j; TString **p = G(L)->strcache[i]; //Find the corresponding slot according to the hash value for (j = 0; j < STRCACHE_M; j++) { //Find cache string if (strcmp(str, getstr(p[j])) == 0) //If there is a cache, the string has been created return p[j]; } for (j = STRCACHE_M - 1; j > 0; j--) //Shifting out of a cache location may empty the last cache p[j] = p[j - 1]; //Create a new string p[0] = luaS_newlstr(L, str, strlen(str)); return p[0]; }
You can see that the code makes a simple caching mechanism with the address as the hash value.
Parameter 1: mainly used to find storage location (hash table)
Parameter 2: create string
Create string entry
TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { if (l <= LUAI_MAXSHORTLEN) return internshrstr(L, str, l); //Short string else { //Long string TString *ts; if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char))) luaM_toobig(L); ts = luaS_createlngstrobj(L, l); memcpy(getstr(ts), str, l * sizeof(char)); return ts; } }
Parameter 1: find storage location (hash table)
Parameter 2: String
Parameter 3: string length
You can see through the length l and luai_ Maxshellen to compare the size to distinguish between long and short strings.
Creation of short string
//hash function unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) { unsigned int h = seed ^ cast_uint(l); for (; l > 0; l--) h ^= ((h<<5) + (h>>2) + cast_byte(str[l - 1])); return h; } static TString *internshrstr (lua_State *L, const char *str, size_t l) { TString *ts; global_State *g = G(L); stringtable *tb = &g->strt; //hash table unsigned int h = luaS_hash(str, l, g->seed);//Calculate hash value TString **list = &tb->hash[lmod(h, tb->size)];//Find the corresponding slot according to the hash value of the string lua_assert(str != NULL); /* otherwise 'memcmp'/'memcpy' are undefined */ //Double check operation, if any, indicates that this string has been created before, it will be returned directly for (ts = *list; ts != NULL; ts = ts->u.hnext) { if (l == ts->shrlen && (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) { /* found! */ if (isdead(g, ts)) /* dead (but not collected yet)? */ changewhite(ts); /* resurrect it */ return ts; } } //The number of strings and the size of the hash table slot determine whether to rehash if (tb->nuse >= tb->size) { /* need to grow string table? */ growstrtab(L, tb); list = &tb->hash[lmod(h, tb->size)]; //rehash completes the new slot of the current string } ts = createstrobj(L, l, LUA_VSHRSTR, h); //String assignment memcpy(getstr(ts), str, l * sizeof(char)); ts->shrlen = cast_byte(l);//String size //Add linked list (header insertion) ts->u.hnext = *list; *list = ts; tb->nuse++;//Number of strings + 1 return ts; }
This is the normal operation of hash table insertion
- Calculate hash value
- Find the slot according to the hash value
- Traverse the current slot to check the duplicate, and return directly if there are duplicates
- Finally, the assignment is to insert the hash table
A linked list is used to resolve hash conflicts in lua strings
rehash of short string
static void tablerehash (TString **vect, int osize, int nsize) { int i; for (i = osize; i < nsize; i++) //Initialize new slot vect[i] = NULL; for (i = 0; i < osize; i++) { //Traverse the old slot for rehash TString *p = vect[i]; //Save current slot vect[i] = NULL; //Empty the current slot while (p) { //Traverse current slot TString *hnext = p->u.hnext; //Save next node location unsigned int h = lmod(p->hash, nsize); //Recalculate hash slot //Insert linked list (header) p->u.hnext = vect[h]; vect[h] = p; p = hnext; //Location of the next node previously saved } } } void luaS_resize (lua_State *L, int nsize) { stringtable *tb = &G(L)->strt; //hash table int osize = tb->size; //oldsize TString **newvect; if (nsize < osize) //Volume reduction tablerehash(tb->hash, osize, nsize); //realloc newvect = luaM_reallocvector(L, tb->hash, osize, nsize, TString*); if (l_unlikely(newvect == NULL)) { //realloc failed if (nsize < osize) tablerehash(tb->hash, nsize, osize);//If the volume has been shrunk before, restore the previous hash table } else { //Update hash table tb->hash = newvect; tb->size = nsize; if (nsize > osize) //Capacity expansion tablerehash(newvect, osize, nsize); } }
The code is also relatively simple. Judge the expansion and reduction of capacity through oldsize and newsize. The key function is tablerehash, and the operation is relatively simple and routine
- Traversal hash table
- Get a new slot according to the hash value, and then insert it
Creation of long strings
TString *luaS_createlngstrobj (lua_State *L, size_t l) { TString *ts = createstrobj(L, l, LUA_VLNGSTR, G(L)->seed); ts->u.lnglen = l; return ts; } TString *luaS_newlstr (lua_State *L, const char *str, size_t l) { if (l <= LUAI_MAXSHORTLEN) /* short string? */ return internshrstr(L, str, l); else { TString *ts; if (l_unlikely(l >= (MAX_SIZE - sizeof(TString))/sizeof(char))) luaM_toobig(L); ts = luaS_createlngstrobj(L, l); //assignment memcpy(getstr(ts), str, l * sizeof(char)); return ts; } }
The creation of long strings is not so complicated. After the creation, the assignment is completed
Deletion of string
void luaS_remove (lua_State *L, TString *ts) { stringtable *tb = &G(L)->strt;//hash table TString **p = &tb->hash[lmod(ts->hash, tb->size)];//Slot corresponding to this string while (*p != ts) //Find ts p = &(*p)->u.hnext; *p = (*p)->u.hnext; //Let the next pointer of the previous node point to the next node tb->nuse--; }
Long string not deleted
- Traverse the corresponding slot to find the string to be deleted
- Let the next pointer of the previous node point to the next node
String comparison
//Comparison of short strings #define eqshrstr(a,b) check_exp((a)->tt == LUA_VSHRSTR, (a) == (b)) //Comparison of long strings int luaS_eqlngstr (TString *a, TString *b) { size_t len = a->u.lnglen; lua_assert(a->tt == LUA_VLNGSTR && b->tt == LUA_VLNGSTR); return (a == b) || /* same instance or... */ ((len == b->u.lnglen) && /* equal length and ... */ (memcmp(getstr(a), getstr(b), len) == 0)); /* equal contents */ }
Short string comparison direct comparison address
Long string comparisons take precedence over address comparisons, followed by regular string comparisons
reference resources
< Lua design and implementation > codedump
Based on lua5 4.3 source code