/dev/kvm
I’ll show the very basic path into KVM from user space. If you have loaded the ‘kvm’ module, you can find kvm as a device node, usually in the /dev/ directory. This character device node provides the way to control virtualization features such as checking capabilities, creating a VM, and destroying a VM.
While the kvm module is loading, it registers a character device as a misc device.
r = misc_register(&kvm_dev);
/*
 * File operations for the kvm character device. Only the ioctl entry
 * points are set here; both are served by kvm_dev_ioctl().
 */
static struct file_operations kvm_chardev_ops = {
.unlocked_ioctl = kvm_dev_ioctl,
.compat_ioctl = kvm_dev_ioctl,
};
/*
 * Misc device descriptor for the kvm node; this is the object handed to
 * misc_register(). The initializers are positional.
 * NOTE(review): presumably minor number, node name and file operations,
 * in struct miscdevice field order -- confirm against the struct declaration.
 */
static struct miscdevice kvm_dev = {
KVM_MINOR,
"kvm",
&kvm_chardev_ops,
};
As you can see here, only the ioctl call is implemented. This means that you can’t do anything useful with ordinary read/write calls or with redirection on the command line. You always have to write an application to use kvm’s features.
The kvm_dev_ioctl function is implemented as follows:
/*
 * ioctl entry point of the kvm device node.
 *
 * Serves the generic commands listed in the switch below and forwards
 * every other command to kvm_arch_dev_ioctl(). Commands that take no
 * argument fail with -EINVAL when arg is non-zero.
 *
 * Returns the command's result, or a negative errno value.
 */
static long kvm_dev_ioctl(struct file *filp,
			  unsigned int ioctl, unsigned long arg)
{
	long vcpu_mmap_size;

	switch (ioctl) {
	case KVM_GET_API_VERSION:
		if (arg)
			return -EINVAL;
		return KVM_API_VERSION;

	case KVM_CREATE_VM:
		if (arg)
			return -EINVAL;
		return kvm_dev_ioctl_create_vm();

	case KVM_CHECK_EXTENSION:
		return kvm_dev_ioctl_check_extension_generic(arg);

	case KVM_GET_VCPU_MMAP_SIZE:
		if (arg)
			return -EINVAL;
		/* One page per region that is compiled in. */
		vcpu_mmap_size = PAGE_SIZE;	/* struct kvm_run */
#ifdef CONFIG_X86
		vcpu_mmap_size += PAGE_SIZE;	/* pio data page */
#endif
#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET
		vcpu_mmap_size += PAGE_SIZE;	/* coalesced mmio ring page */
#endif
		return vcpu_mmap_size;

	case KVM_TRACE_ENABLE:
	case KVM_TRACE_PAUSE:
	case KVM_TRACE_DISABLE:
		return kvm_trace_ioctl(ioctl, arg);

	default:
		/* Not a generic command: let the architecture code decide. */
		return kvm_arch_dev_ioctl(filp, ioctl, arg);
	}
}
There are 7 predefined commands. If a command is not recognized by this function, it is passed to the kvm_arch_dev_ioctl() function, which does additional checks with the actual hardware module such as kvm_intel or kvm_amd.
KVM_GET_API_VERSION
It returns the version of the KVM API. In kernel 2.6.30, it is 12 (defined in kvm.h as KVM_API_VERSION).
KVM_CREATE_VM
KVM_CHECK_EXTENSION
KVM_GET_VCPU_MMAP_SIZE
KVM_TRACE_ENABLE
KVM_TRACE_PAUSE
KVM_TRACE_DISABLE
How does the kernel determine the low and high memory zones?
I thought Linux fixed the highmem zone and the lowmem zone at compile time. But I was wrong. I even found a command line parameter named ‘highmem=’ which can set the highmem size.
Actually, this calculation is done during initialization, and the following is the code which actually does it.
895/* 896 * Determine low and high memory ranges: 897 */ 898unsigned long __init find_max_low_pfn(void) 899{ 900 unsigned long max_low_pfn; 901 902 max_low_pfn = max_pfn; 903 if (max_low_pfn > MAXMEM_PFN) { 904 if (highmem_pages == -1) 905 highmem_pages = max_pfn - MAXMEM_PFN; 906 if (highmem_pages + MAXMEM_PFN < max_pfn) 907 max_pfn = MAXMEM_PFN + highmem_pages; 908 if (highmem_pages + MAXMEM_PFN > max_pfn) { 909 printk("only %luMB highmem pages available, ignoring highmem size of %uMB.\n", pages_to_mb(max_pfn - MAXMEM_PFN), pages_to_mb(highmem_pages)); 910 highmem_pages = 0; 911 } 912 max_low_pfn = MAXMEM_PFN; 913#ifndef CONFIG_HIGHMEM 914 /* Maximum memory usable is what is directly addressable */ 915 printk(KERN_WARNING "Warning only %ldMB will be used.\n", 916 MAXMEM>>20); 917 if (max_pfn > MAX_NONPAE_PFN) 918 printk(KERN_WARNING "Use a PAE enabled kernel.\n"); 919 else 920 printk(KERN_WARNING "Use a HIGHMEM enabled kernel.\n"); 921 max_pfn = MAXMEM_PFN; 922#else /* !CONFIG_HIGHMEM */ 923#ifndef CONFIG_X86_PAE 924 if (max_pfn > MAX_NONPAE_PFN) { 925 max_pfn = MAX_NONPAE_PFN; 926 printk(KERN_WARNING "Warning only 4GB will be used.\n"); 927 printk(KERN_WARNING "Use a PAE enabled kernel.\n"); 928 } 929#endif /* !CONFIG_X86_PAE */ 930#endif /* !CONFIG_HIGHMEM */ 931 } else { 932 if (highmem_pages == -1) 933 highmem_pages = 0; 934#ifdef CONFIG_HIGHMEM 935 if (highmem_pages >= max_pfn) { 936 printk(KERN_ERR "highmem size specified (%uMB) is bigger than pages available (%luMB)!.\n", pages_to_mb(highmem_pages), pages_to_mb(max_pfn)); 937 highmem_pages = 0; 938 } 939 if (highmem_pages) { 940 if (max_low_pfn-highmem_pages < 64*1024*1024/PAGE_SIZE){ 941 printk(KERN_ERR "highmem size %uMB results in smaller than 64MB lowmem, ignoring it.\n", pages_to_mb(highmem_pages)); 942 highmem_pages = 0; 943 } 944 max_low_pfn -= highmem_pages; 945 } 946#else 947 if (highmem_pages) 948 printk(KERN_ERR "ignoring highmem size on non-highmem kernel!\n"); 949#endif 950 } 951 return 
max_low_pfn; 952}
How does Linux get the maximum pfn number?
I wondered where Linux gets information about the memory capacity of the machine. On an i386 machine, we can get this information from the BIOS via e820, which is shorthand for the BIOS function used to query the memory map. You can find some information on Wikipedia: http://en.wikipedia.org/wiki/E820
Anyway, after checking the basic memory map information, the kernel calls the following function to get the max_pfn value.
868/* 869 * Find the highest page frame number we have available 870 */ 871void __init find_max_pfn(void) 872{ 873 int i; 874 875 max_pfn = 0; 876 if (efi_enabled) { 877 efi_memmap_walk(efi_find_max_pfn, &max_pfn); 878 return; 879 } 880 881 for (i = 0; i < e820.nr_map; i++) { 882 unsigned long start, end; 883 /* RAM? */ 884 if (e820.map[i].type != E820_RAM) 885 continue; 886 start = PFN_UP(e820.map[i].addr); 887 end = PFN_DOWN(e820.map[i].addr + e820.map[i].size); 888 if (start >= end) 889 continue; 890 if (end > max_pfn) 891 max_pfn = end; 892 } 893}
Where can I find the kernel cmdline parameter parsing code?
I was wondering how the highmem parameter affects the system, so I tried to dig into the kernel code. But I had to spend many hours finding the starting point.
This is the code I finally found, and I want to keep it here for later use.
http://lxr.linux.no/linux+v2.6.11/arch/i386/kernel/setup.c
668static void __init parse_cmdline_early (char ** cmdline_p) 669{ 670 char c = ' ', *to = command_line, *from = saved_command_line; 671 int len = 0; 672 int userdef = 0; 673 674 /* Save unparsed command line copy for /proc/cmdline */ 675 saved_command_line[COMMAND_LINE_SIZE-1] = '\0'; 676 677 for (;;) { 678 if (c != ' ') 679 goto next_char; 680 /* 681 * "mem=nopentium" disables the 4MB page tables. 682 * "mem=XXX[kKmM]" defines a memory region from HIGH_MEM 683 * to <mem>, overriding the bios size. 684 * "memmap=XXX[KkmM]@XXX[KkmM]" defines a memory region from 685 * <start> to <start>+<mem>, overriding the bios size. 686 * 687 * HPA tells me bootloaders need to parse mem=, so no new 688 * option should be mem= [also see Documentation/i386/boot.txt] 689 */ 690 if (!memcmp(from, "mem=", 4)) { 691 if (to != command_line) 692 to--; 693 if (!memcmp(from+4, "nopentium", 9)) { 694 from += 9+4; 695 clear_bit(X86_FEATURE_PSE, boot_cpu_data.x86_capability); 696 disable_pse = 1; 697 } else { 698 /* If the user specifies memory size, we 699 * limit the BIOS-provided memory map to 700 * that size. exactmap can be used to specify 701 * the exact map. mem=number can be used to 702 * trim the existing memory map. 703 */ 704 unsigned long long mem_size; 705 706 mem_size = memparse(from+4, &from); 707 limit_regions(mem_size); 708 userdef=1; 709 } 710 } 711 712 else if (!memcmp(from, "memmap=", 7)) { 713 if (to != command_line) 714 to--; 715 if (!memcmp(from+7, "exactmap", 8)) { 716 from += 8+7; 717 e820.nr_map = 0; 718 userdef = 1; 719 } else { 720 /* If the user specifies memory size, we 721 * limit the BIOS-provided memory map to 722 * that size. exactmap can be used to specify 723 * the exact map. mem=number can be used to 724 * trim the existing memory map. 
725 */ 726 unsigned long long start_at, mem_size; 727 728 mem_size = memparse(from+7, &from); 729 if (*from == '@') { 730 start_at = memparse(from+1, &from); 731 add_memory_region(start_at, mem_size, E820_RAM); 732 } else if (*from == '#') { 733 start_at = memparse(from+1, &from); 734 add_memory_region(start_at, mem_size, E820_ACPI); 735 } else if (*from == '$') { 736 start_at = memparse(from+1, &from); 737 add_memory_region(start_at, mem_size, E820_RESERVED); 738 } else { 739 limit_regions(mem_size); 740 userdef=1; 741 } 742 } 743 } 744 745 else if (!memcmp(from, "noexec=", 7)) 746 noexec_setup(from + 7); 747 748 749#ifdef CONFIG_X86_SMP 750 /* 751 * If the BIOS enumerates physical processors before logical, 752 * maxcpus=N at enumeration-time can be used to disable HT. 753 */ 754 else if (!memcmp(from, "maxcpus=", 8)) { 755 extern unsigned int maxcpus; 756 757 maxcpus = simple_strtoul(from + 8, NULL, 0); 758 } 759#endif 760 761#ifdef CONFIG_ACPI_BOOT 762 /* "acpi=off" disables both ACPI table parsing and interpreter */ 763 else if (!memcmp(from, "acpi=off", 8)) { 764 disable_acpi(); 765 } 766 767 /* acpi=force to over-ride black-list */ 768 else if (!memcmp(from, "acpi=force", 10)) { 769 acpi_force = 1; 770 acpi_ht = 1; 771 acpi_disabled = 0; 772 } 773 774 /* acpi=strict disables out-of-spec workarounds */ 775 else if (!memcmp(from, "acpi=strict", 11)) { 776 acpi_strict = 1; 777 } 778 779 /* Limit ACPI just to boot-time to enable HT */ 780 else if (!memcmp(from, "acpi=ht", 7)) { 781 if (!acpi_force) 782 disable_acpi(); 783 acpi_ht = 1; 784 } 785 786 /* "pci=noacpi" disable ACPI IRQ routing and PCI scan */ 787 else if (!memcmp(from, "pci=noacpi", 10)) { 788 acpi_disable_pci(); 789 } 790 /* "acpi=noirq" disables ACPI interrupt routing */ 791 else if (!memcmp(from, "acpi=noirq", 10)) { 792 acpi_noirq_set(); 793 } 794 795 else if (!memcmp(from, "acpi_sci=edge", 13)) 796 acpi_sci_flags.trigger = 1; 797 798 else if (!memcmp(from, "acpi_sci=level", 14)) 799 
acpi_sci_flags.trigger = 3; 800 801 else if (!memcmp(from, "acpi_sci=high", 13)) 802 acpi_sci_flags.polarity = 1; 803 804 else if (!memcmp(from, "acpi_sci=low", 12)) 805 acpi_sci_flags.polarity = 3; 806 807#ifdef CONFIG_X86_IO_APIC 808 else if (!memcmp(from, "acpi_skip_timer_override", 24)) 809 acpi_skip_timer_override = 1; 810#endif 811 812#ifdef CONFIG_X86_LOCAL_APIC 813 /* disable IO-APIC */ 814 else if (!memcmp(from, "noapic", 6)) 815 disable_ioapic_setup(); 816#endif /* CONFIG_X86_LOCAL_APIC */ 817#endif /* CONFIG_ACPI_BOOT */ 818 819 /* 820 * highmem=size forces highmem to be exactly 'size' bytes. 821 * This works even on boxes that have no highmem otherwise. 822 * This also works to reduce highmem size on bigger boxes. 823 */ 824 else if (!memcmp(from, "highmem=", 8)) 825 highmem_pages = memparse(from+8, &from) >> PAGE_SHIFT; 826 827 /* 828 * vmalloc=size forces the vmalloc area to be exactly 'size' 829 * bytes. This can be used to increase (or decrease) the 830 * vmalloc area - the default is 128m. 831 */ 832 else if (!memcmp(from, "vmalloc=", 8)) 833 __VMALLOC_RESERVE = memparse(from+8, &from); 834 835 next_char: 836 c = *(from++); 837 if (!c) 838 break; 839 if (COMMAND_LINE_SIZE <= ++len) 840 break; 841 *(to++) = c; 842 } 843 *to = '\0'; 844 *cmdline_p = command_line; 845 if (userdef) { 846 printk(KERN_INFO "user-defined physical RAM map:\n"); 847 print_memory_map("user"); 848 } 849}
Why does free() not return allocated physical memory to the OS?
If your application has to deal with a huge amount of memory, you will sometimes run into this unexpected result.
Even though you have free()d all of the memory, your process still appears to hold it when you check with the vmstat or free command.
The behavior can differ among Linux distributions and library versions. But normally you can see this.
It happens because of an optimization inside the malloc() library. It usually does not return memory to the OS because the area may well be used again. So, the malloc library keeps it for later use. But if you use that memory only once and never use it again, it wastes memory and can sometimes reduce performance.
You can ask the library to actually release the freed memory back to the OS by calling the malloc_trim() API.
Meaning of the ’sysctl.max_lock_depth’
This tunable limits the amount of deadlock-checking the kernel will do. The default value is 1024.
You can see the relevant code in the following snippet.
146/*
147 * Max number of times we'll walk the boosting chain:
148 */
149int max_lock_depth = 1024;
150
151/*
152 * Adjust the priority chain. Also used for deadlock detection.
153 * Decreases task's usage by one - may thus free the task.
154 * Returns 0 or -EDEADLK.
155 */
156static int rt_mutex_adjust_prio_chain(struct task_struct *task,
157 int deadlock_detect,
158 struct rt_mutex *orig_lock,
159 struct rt_mutex_waiter *orig_waiter,
160 struct task_struct *top_task)
161{
162 struct rt_mutex *lock;
163 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
164 int detect_deadlock, ret = 0, depth = 0;
165 unsigned long flags;
166
167 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
168 deadlock_detect);
169
170 /*
171 * The (de)boosting is a step by step approach with a lot of
172 * pitfalls. We want this to be preemptible and we want hold a
173 * maximum of two locks per step. So we have to check
174 * carefully whether things change under us.
175 */
176 again:
177 if (++depth > max_lock_depth) {
178 static int prev_max;
179
180 /*
181 * Print this only once. If the admin changes the limit,
182 * print a new message when reaching the limit again.
183 */
184 if (prev_max != max_lock_depth) {
185 prev_max = max_lock_depth;
186 printk(KERN_WARNING "Maximum lock depth %d reached "
187 "task: %s (%d)\n", max_lock_depth,
188 top_task->comm, task_pid_nr(top_task));
189 }
190 put_task_struct(task);
191
192 return deadlock_detect ? -EDEADLK : 0;
193 }
194 retry:
195 /*
Are malloc() and free() functions thread-safe?
Recently, someone asked me why he couldn’t allocate enough memory when using a lot of threads.
At first, I just tried to explain that there is no problem in the Linux kernel or the malloc() functions. But when I double-checked his code, I found that he used malloc() without any safeguard. Yeah, no synchronization around malloc(). Sigh.
malloc() and free() use static data structures, which can cause synchronization problems in a multi-threaded application when the C library’s implementation is not thread-safe. In that case, you are better off writing your own allocation and deallocation functions and putting some synchronization code into them. That will be helpful.
There are some web documents that talk about malloc() and free(). One of them is this: http://www.ibm.com/developerworks/linux/library/l-reent.html
Additional Note: RHEL products use ptmalloc(), which is a thread-safe version of malloc(). So, it does not cause any problem in a threaded application by itself.
Meaning of /proc/stat
What is the meaning of each field in the /proc/stat file?
Some documents say that each field is expressed in jiffies, but actually it isn’t. That was true in kernel 2.4, but no longer in kernel 2.6. We can easily see the meaning of each field in the kernel source.
http://lxr.linux.no/linux+v2.6.27.10/fs/proc/proc_misc.c
static int show_stat(struct seq_file *p, void *v)
504{
505 int i;
506 unsigned long jif;
507 cputime64_t user, nice, system, idle, iowait, irq, softirq, steal;
508 cputime64_t guest;
509 u64 sum = 0;
510 struct timespec boottime;
511 unsigned int *per_irq_sum;
512
513 per_irq_sum = kzalloc(sizeof(unsigned int)*NR_IRQS, GFP_KERNEL);
514 if (!per_irq_sum)
515 return -ENOMEM;
516
517 user = nice = system = idle = iowait =
518 irq = softirq = steal = cputime64_zero;
519 guest = cputime64_zero;
520 getboottime(&boottime);
521 jif = boottime.tv_sec;
522
523 for_each_possible_cpu(i) {
524 int j;
525
526 user = cputime64_add(user, kstat_cpu(i).cpustat.user);
527 nice = cputime64_add(nice, kstat_cpu(i).cpustat.nice);
528 system = cputime64_add(system, kstat_cpu(i).cpustat.system);
529 idle = cputime64_add(idle, kstat_cpu(i).cpustat.idle);
530 iowait = cputime64_add(iowait, kstat_cpu(i).cpustat.iowait);
531 irq = cputime64_add(irq, kstat_cpu(i).cpustat.irq);
532 softirq = cputime64_add(softirq, kstat_cpu(i).cpustat.softirq);
533 steal = cputime64_add(steal, kstat_cpu(i).cpustat.steal);
534 guest = cputime64_add(guest, kstat_cpu(i).cpustat.guest);
535 for (j = 0; j < NR_IRQS; j++) {
536 unsigned int temp = kstat_cpu(i).irqs[j];
537 sum += temp;
538 per_irq_sum[j] += temp;
539 }
540 sum += arch_irq_stat_cpu(i);
541 }
542 sum += arch_irq_stat();
543
544 seq_printf(p, "cpu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
545 (unsigned long long)cputime64_to_clock_t(user),
546 (unsigned long long)cputime64_to_clock_t(nice),
547 (unsigned long long)cputime64_to_clock_t(system),
548 (unsigned long long)cputime64_to_clock_t(idle),
549 (unsigned long long)cputime64_to_clock_t(iowait),
550 (unsigned long long)cputime64_to_clock_t(irq),
551 (unsigned long long)cputime64_to_clock_t(softirq),
552 (unsigned long long)cputime64_to_clock_t(steal),
553 (unsigned long long)cputime64_to_clock_t(guest));
554 for_each_online_cpu(i) {
555
556 /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
557 user = kstat_cpu(i).cpustat.user;
558 nice = kstat_cpu(i).cpustat.nice;
559 system = kstat_cpu(i).cpustat.system;
560 idle = kstat_cpu(i).cpustat.idle;
561 iowait = kstat_cpu(i).cpustat.iowait;
562 irq = kstat_cpu(i).cpustat.irq;
563 softirq = kstat_cpu(i).cpustat.softirq;
564 steal = kstat_cpu(i).cpustat.steal;
565 guest = kstat_cpu(i).cpustat.guest;
566 seq_printf(p,
567 "cpu%d %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
568 i,
569 (unsigned long long)cputime64_to_clock_t(user),
570 (unsigned long long)cputime64_to_clock_t(nice),
571 (unsigned long long)cputime64_to_clock_t(system),
572 (unsigned long long)cputime64_to_clock_t(idle),
573 (unsigned long long)cputime64_to_clock_t(iowait),
574 (unsigned long long)cputime64_to_clock_t(irq),
575 (unsigned long long)cputime64_to_clock_t(softirq),
576 (unsigned long long)cputime64_to_clock_t(steal),
577 (unsigned long long)cputime64_to_clock_t(guest));
578 }
579 seq_printf(p, "intr %llu", (unsigned long long)sum);
580
581 for (i = 0; i < NR_IRQS; i++)
582 seq_printf(p, " %u", per_irq_sum[i]);
583
584 seq_printf(p,
585 "\nctxt %llu\n"
586 "btime %lu\n"
587 "processes %lu\n"
588 "procs_running %lu\n"
589 "procs_blocked %lu\n",
590 nr_context_switches(),
591 (unsigned long)jif,
592 total_forks,
593 nr_running(),
594 nr_iowait());
595
596 kfree(per_irq_sum);
597 return 0;
598}
From line 544 to 553, you can see the meaning of each field.
One slightly strange part is that the values, which are accumulated during the timer interrupt, are converted to the clock_t type using the cputime64_to_clock_t() macro.
http://lxr.linux.no/linux+v2.6.27.10/include/asm-generic/cputime.h#L67
/* 65 * Convert cputime64 to clock. 66 */ 67#define cputime64_to_clock_t(__ct) jiffies_64_to_clock_t(__ct) 68
http://lxr.linux.no/linux+v2.6.27.10/kernel/time.c#L618
618u64 jiffies_64_to_clock_t(u64 x)
619{
620#if (TICK_NSEC % (NSEC_PER_SEC / USER_HZ)) == 0
621# if HZ < USER_HZ
622 x = div_u64(x * USER_HZ, HZ);
623# elif HZ > USER_HZ
624 x = div_u64(x, HZ / USER_HZ);
625# else
626 /* Nothing to do */
627# endif
628#else
629 /*
630 * There are better ways that don't overflow early,
631 * but even this doesn't overflow in hundreds of years
632 * in 64 bits, so..
633 */
634 x = div_u64(x * TICK_NSEC, (NSEC_PER_SEC / USER_HZ));
635#endif
636 return x;
637}
638EXPORT_SYMBOL(jiffies_64_to_clock_t);
In the end, the internal jiffies value is divided by the ratio between HZ and USER_HZ. USER_HZ is 100. If HZ is 1000, the internal jiffies value will be divided by 10. This means that we see the same result on 2.4 and 2.6, but it does not mean that those values simply come from jiffies.
If we want to see the values in seconds, we can divide them by the clock tick rate USER_HZ, which user space obtains with sysconf(_SC_CLK_TCK). Note that the CLOCKS_PER_SEC macro defined in time.h applies to clock() and is not the right divisor here.
Unix Top command
If you are a Unix person, you know the ‘top’ command well; it displays very useful information about system usage. I have also used it a lot while checking system state. But recently, one of my customers asked me about the difference between the ‘CPU states:’ field and each process’s CPU usage. He asked why they do not match.
I tried to figure it out by checking the source code, but it was still not clear. So, I googled the ‘top’ command. Because ‘top’ is such a common word, it was not easy to find the right documentation. I struggled for a few days.

But finally I found the original site, ‘http://www.unixtop.org/’.
They explain this in a very simple but clear way. If you have any interest in this field, you can check the ‘Documentation’ section of that web site.
What happened in my stack?
If you write a program which requires a big chunk of memory at one time, you will notice the influence of the ‘stack size’.
I will show you what happens under this stack limit with the following simple program.
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

/*
 * Allocate ever larger buffers (512MB, 1024MB, ... while below 3GB), print
 * the address malloc() returned for each one, and block on a line of
 * stdin before freeing it, so the address space can be examined (for
 * example with pmap) while the buffer is still allocated.
 */
int main(void) {
    char *a;
    unsigned long i, half_gb = 512UL * 1024 * 1024;
    unsigned long max = 3UL * 1024 * 1024 * 1024;

    /* getpid() is declared in <unistd.h>; pid_t is cast to match %d. */
    printf("PID = %d\n\n", (int)getpid());
    for (i = half_gb; i < max; i += half_gb) {
        a = malloc(i);
        /* i is unsigned long, so the size needs %lu; %p expects void *. */
        printf("a = %p, size = %luMB\n", (void *)a, i / (1024 * 1024));
        if (a != NULL) {
            /*
             * fgets() takes an int size: clamp it so that buffers larger
             * than INT_MAX do not turn into a negative length.
             */
            int limit = (i > (unsigned long)INT_MAX) ? INT_MAX : (int)i;

            /* Used only as a pause; EOF simply moves on to the next size. */
            fgets(a, limit, stdin);
            free(a);
        }
    }
    return 0;
}
In Linux, you can change the stack size with the ‘ulimit -s’ command. By default, it is 10240 KB (10MB).
[root@localhost ~]# ulimit -a core file size (blocks, -c) 0 data seg size (kbytes, -d) unlimited scheduling priority (-e) 0 file size (blocks, -f) unlimited pending signals (-i) 32768 max locked memory (kbytes, -l) 32 max memory size (kbytes, -m) unlimited open files (-n) 1024 pipe size (512 bytes, -p) 8 POSIX message queues (bytes, -q) 819200 real-time priority (-r) 0 stack size (kbytes, -s) 10240 cpu time (seconds, -t) unlimited max user processes (-u) 32768 virtual memory (kbytes, -v) unlimited file locks (-x) unlimited
As you can see, the stack size is ‘10240 kbytes’. We can change it with the ‘-s’ option. First, I will run the above application without changing anything.
[root@localhost ~]# ./test PID = 7865 a = 0x97f9e008, size = 512MB a = 0x77f9e008, size = 1024MB a = 0x57f9e008, size = 1536MB a = 0x37f9e008, size = 2048MB a = 0x17f9e008, size = 2560MB
And the memory layout will be something like this:
[root@localhost ~]# pmap -x 7865 7865: ./test Address Kbytes RSS Anon Locked Mode Mapping 004c9000 104 - - - r-x-- ld-2.5.so 004e3000 4 - - - r-x-- ld-2.5.so 004e4000 4 - - - rwx-- ld-2.5.so 004e7000 1268 - - - r-x-- libc-2.5.so 00624000 8 - - - r-x-- libc-2.5.so 00626000 4 - - - rwx-- libc-2.5.so 00627000 12 - - - rwx-- [ anon ] 00f43000 4 - - - r-x-- [ anon ] 08048000 4 - - - r-x-- test 08049000 4 - - - rw--- test 57f9e000 1572876 - - - rw--- [ anon ] b7fae000 8 - - - rw--- [ anon ] bfaac000 84 - - - rw--- [ stack ] -------- ------- ------- ------- ------- total kB 1574384 - - -
You can see that the malloc() memory starts at 57f9e000. It looks normal, and there is nothing special in this layout. But you will soon find a difference.
Now change the stack size like this:
[root@localhost ~]# ulimit -s unlimited [root@localhost ~]# ulimit -a core file size (blocks, -c) 0 data seg size (kbytes, -d) unlimited scheduling priority (-e) 0 file size (blocks, -f) unlimited pending signals (-i) 32768 max locked memory (kbytes, -l) 32 max memory size (kbytes, -m) unlimited open files (-n) 1024 pipe size (512 bytes, -p) 8 POSIX message queues (bytes, -q) 819200 real-time priority (-r) 0 stack size (kbytes, -s) unlimited cpu time (seconds, -t) unlimited max user processes (-u) 32768 virtual memory (kbytes, -v) unlimited file locks (-x) unlimited
And run the same application again.
[root@localhost ~]# ./test PID = 7791 a = 0x40012008, size = 512MB a = 0x40012008, size = 1024MB a = 0x40012008, size = 1536MB a = (nil), size = 2048MB a = (nil), size = 2560MB
It failed when I requested 2GB of memory, which had succeeded in the previous test. When I looked at the memory layout, it was different.
[root@localhost ~]# pmap -x 7791 7791: ./test Address Kbytes RSS Anon Locked Mode Mapping 004c9000 104 - - - r-x-- ld-2.5.so 004e3000 4 - - - r-x-- ld-2.5.so 004e4000 4 - - - rwx-- ld-2.5.so 004e7000 1268 - - - r-x-- libc-2.5.so 00624000 8 - - - r-x-- libc-2.5.so 00626000 4 - - - rwx-- libc-2.5.so 00627000 12 - - - rwx-- [ anon ] 08048000 4 - - - r-x-- test 08049000 4 - - - rwx-- test 40000000 4 - - - r-x-- [ anon ] 40001000 8 - - - rw--- [ anon ] 40010000 1572876 - - - rw--- [ anon ] bf87a000 88 - - - rw--- [ stack ] -------- ------- ------- ------- ------- total kB 1574388 - - -
The main difference is that with the ‘unlimited’ option, we can see more allocated memory chunks. You can see who uses the other memory chunks besides the one allocated by malloc. As you can see in the following block, the main reason is ‘[vdso]’. Because it is located at a fixed address, every memory allocation has to start after it. So, any possible memory allocation must be placed after that address. This reduces the available memory size.
40000000-40001000 r-xp 40000000 00:00 0 [vdso] 40001000-40003000 rw-p 40001000 00:00 0 40010000-80013000 rw-p 40010000 00:00 0 bffde000-bfff3000 rw-p bffde000 00:00 0 [stack]
The main problem is that ‘[vdso]’ is inserted into the middle of the heap when the stack size is set to ‘unlimited’. [vdso] is the memory area which is used for the fast system call mechanism. With this memory area, we don’t need to raise an interrupt; we just access that memory range. So, it is much faster than an interrupt-based call.
I tried to find out why this location differs for different stack sizes, and I found the reason in the arch_pick_mmap_layout(..) function.
33/*
34 * Top of mmap area (just below the process stack).
35 *
36 * Leave an at least ~128 MB hole.
37 */
38#define MIN_GAP (128*1024*1024)
39#define MAX_GAP (TASK_SIZE/6*5)
40
41/*
42 * True on X86_32 or when emulating IA32 on X86_64
43 */
44static int mmap_is_ia32(void)
45{
46#ifdef CONFIG_X86_32
47 return 1;
48#endif
49#ifdef CONFIG_IA32_EMULATION
50 if (test_thread_flag(TIF_IA32))
51 return 1;
52#endif
53 return 0;
54}
55
56static int mmap_is_legacy(void)
57{
58 if (current->personality & ADDR_COMPAT_LAYOUT)
59 return 1;
60
61 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY)
62 return 1;
63
64 return sysctl_legacy_va_layout;
65}
66
67static unsigned long mmap_rnd(void)
68{
69 unsigned long rnd = 0;
70
71 /*
72 * 8 bits of randomness in 32bit mmaps, 20 address space bits
73 * 28 bits of randomness in 64bit mmaps, 40 address space bits
74 */
75 if (current->flags & PF_RANDOMIZE) {
76 if (mmap_is_ia32())
77 rnd = (long)get_random_int() % (1<<8);
78 else
79 rnd = (long)(get_random_int() % (1<<28));
80 }
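81 return rnd << PAGE_SHIFT;
82}
83
84static unsigned long mmap_base(void)
85{
86 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur;
87
88 if (gap < MIN_GAP)
89 gap = MIN_GAP;
90 else if (gap > MAX_GAP)
91 gap = MAX_GAP;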
92
93 return PAGE_ALIGN(TASK_SIZE - gap - mmap_rnd());
94}
95
96/*
97 * Bottom-up (legacy) layout on X86_32 did not support randomization, X86_64
98 * does, but not when emulating X86_32
99 */
100static unsigned long mmap_legacy_base(void)
101{
102 if (mmap_is_ia32())
103 return TASK_UNMAPPED_BASE;
104 else
105 return TASK_UNMAPPED_BASE + mmap_rnd();
106}
107
108/*
109 * This function, called very early during the creation of a new
110 * process VM image, sets up which VM layout function to use:
111 */
112void arch_pick_mmap_layout(struct mm_struct *mm)
113{
114 if (mmap_is_legacy()) {
115 mm->mmap_base = mmap_legacy_base();
116 mm->get_unmapped_area = arch_get_unmapped_area;
117 mm->unmap_area = arch_unmap_area;
118 } else {
119 mm->mmap_base = mmap_base();
120 mm->get_unmapped_area = arch_get_unmapped_area_topdown;
121 mm->unmap_area = arch_unmap_area_topdown;
122 }
123}
124
mmap_is_legacy() will return TRUE if the stack size is set to unlimited (or for some other reasons which I will not explain here). If mmap_is_legacy() returns TRUE, mm->mmap_base is set to TASK_UNMAPPED_BASE (1GB) on an i386 box. This is the start address of the area used for mmap()-based allocations (the “heap” in the examples above), and the vdso is located right there. So, the problem(?) happens.