Data structure - data compression algorithm based on Huffman tree

Refer to Yan Weimin's textbook < data structure > > 2nd Edition

describe

Input a string of strings, build corresponding Huffman tree according to the frequency of characters in the given string, and construct Huffman coding table. On this basis, the compressed file can be compressed (i.e. encoded), and the compressed binary encoding file can be decompressed (i.e. decoded).

input

Multiple groups of data, one row for each group of data, is a string (only 26 lowercase letters are considered). When the input string is "0", the input ends.

output

Output 2n+3 lines for each group of data (n is the number of character categories in the input string). The first line is the occurrence frequency of the characters (only the existing characters are output in the format of character: frequency). Each two groups of characters are separated by a space, and the characters are arranged in the order of ASCII code from small to large. The second line to 2n is the final state of the storage structure of Huffman tree (as shown in table 5.2 (b) on page 139 of the textbook), and the data in one line is separated by spaces). 2n+1 is the Huffman code of each character (only the existing characters are output, the format is: character: Code). Each two groups of characters are separated by a space, and the characters are arranged in the order of ASCII code from small to large. The 2n+2 line is the encoded string, and the 2n+3 line is the decoded string (the same as the input string)

sample input

aaaaaaabbbbbccdddd
aabccc
0

sample output

a:7 b:5 c:2 d:4
1 7 7 0 0
2 5 6 0 0
3 2 5 0 0
4 4 5 0 0
5 6 6 3 4
6 11 7 2 5
7 18 0 1 6
a:0 b:10 c:110 d:111
00000001010101010110110111111111111
aaaaaaabbbbbccdddd
a:2 b:1 c:3
1 2 4 0 0
2 1 4 0 0
3 3 5 0 0
4 3 5 2 1
5 6 0 3 4
a:11 b:10 c:0
111110000
aabccc

Algorithm implementation

#include<iostream>
#include<map>
#include<string>
#include<stdio.h>
#include<memory.h>
using namespace std;

typedef struct{
  char c;
  int weight;
  int lchild,rchild,parent;
}HuffmanNode,*HuffmanTree;

typedef struct {
	char ch;
	char *pHC;
}HuffCodeNode;
typedef HuffCodeNode *HuffmanCode;

void Select(HuffmanTree &HT, int index, int &s1, int &s2)
{
	int i;
	for (i = 1; i <= index; i++)
		if (HT[i].parent == 0)
			break;
	s1 = i;
	for (int j = s1 + 1; j <= index; j++)
		if (HT[j].parent == 0 && HT[j].weight < HT[s1].weight)
			s1 = j;
	int k;
	for (k = 1; k <= index; k++)
		if (HT[k].parent == 0 && k != s1)
			break;
	s2 = k;
	for (int j = s2 + 1; j <= index; j++)
		if (HT[j].parent == 0 && HT[j].weight < HT[s2].weight&&j != s1)
			s2 = j;
}

int CreateHuffmanTree(HuffmanTree& HT, string str)
{
  map<char,int> mymap;
  for(int i=0;i<str.length();i++)
    mymap.find(str[i]) != mymap.end() ? mymap[str[i]]++ : mymap[str[i]] = 1;
  for(auto it=mymap.begin();it!=mymap.end();it++)
    cout<<it->first<<":"<<it->second<<" ";
  cout<<endl;
  int n = mymap.size();
  HT=new HuffmanNode[2*n];
  for(int i=0;i<2*n;i++)
  {
    HT[i].c='\0';
    HT[i].weight=0;
    HT[i].lchild=HT[i].rchild=0;
    HT[i].parent=0;
  }
  map<char,int>::iterator it=mymap.begin();
  for(int i=1;i<=n, it!=mymap.end();i++,it++)
  {
    HT[i].c=it->first;
    HT[i].weight=it->second;
  }
  for(int i=n+1;i<2*n;i++)
  {
    int s1,s2;
    Select(HT, i-1, s1, s2);
    HT[i].lchild=s1;HT[i].rchild=s2;
    HT[s2].parent=i;
    HT[s1].parent=i;
    HT[i].weight=HT[s1].weight+HT[s2].weight;
  }
  return n;
}
void CreateHuffmanCode(HuffmanTree HT, HuffmanCode &HC, int n)
{
	HC = new HuffCodeNode[n + 1];
	char *dc = new char[n];
	dc[n - 1] = '\0';
	int start;
	for (int i = 1; i <= n; i++)
	{
		HC[i].ch = HT[i].c;
		start = n - 1;
		int c = i, f = HT[c].parent;
		while (f)
		{
			if (HT[f].lchild == c)
				dc[--start] = '0';
			else
				dc[--start] = '1';
			c = f;
			f = HT[f].parent;
		}
		int m = n - start;
		HC[i].pHC = new char[m];
		memcpy(HC[i].pHC, dc + start, m);
	}
}

void HuffmanPrintInfo(HuffmanTree& HT, int n)
{
	for (int i = 1; i < 2 * n; i++)
	{
		cout << i <<' '<< HT[i].weight << ' ' << HT[i].parent
			<< ' ' << HT[i].lchild << ' ' << HT[i].rchild << endl;
	}
}
void HuffmanCodePrint(HuffmanCode &HC, int n)
{
	for (int i = 1; i <= n; i++)
		cout << HC[i].ch<<":"<<HC[i].pHC <<" ";
    cout<<endl;
}
string EncodeStr(HuffmanCode &HC, string str, int n)
{
  string res="";
  for(int i=0;i<str.length();i++)
  {
    for(int j=1;j<=n;j++)
    {
      if(str[i]==HC[j].ch)
      {
        res+=HC[j].pHC;
        break;
      }
    }
  }
  return res;
}
string DeCodeStr(HuffmanTree HT, int n, string code)
{
	string res = "";
	char ch;
	int start = 0;
	int i = 0;
	while ((ch=code[i++])!='\0')
	{
		if (ch == '0')
			start = HT[2 * n - 1].lchild;
		else if (ch == '1')
			start = HT[2 * n - 1].rchild;
		while (HT[start].lchild != 0)
		{
			if ((ch = code[i++]) == '\0') return res;
			if (ch == '0')
				start = HT[start].lchild;
			else if (ch == '1')
				start = HT[start].rchild;
		}
		res += HT[start].c;
	}
	return res;
}
int main()
{
	string str;
	cin >> str;
	while (str != "0")
	{
		HuffmanTree HT;
		HuffmanCode HC;
		int n = CreateHuffmanTree(HT, str);
		CreateHuffmanCode(HT, HC, n);
		HuffmanPrintInfo(HT, n);
		HuffmanCodePrint(HC, n);
		string encode = EncodeStr(HC, str, n);
		cout << encode << endl;
		string decode = DeCodeStr(HT, n, encode);
		cout << decode << endl;
		cin >> str;
	}
  return 0;
}

Keywords: ascii encoding

Added by oliverw92 on Sat, 20 Jun 2020 09:36:28 +0300